{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T11:02:58Z","timestamp":1740135778333,"version":"3.37.3"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2022,3,31]],"date-time":"2022-03-31T00:00:00Z","timestamp":1648684800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,3,31]],"date-time":"2022-03-31T00:00:00Z","timestamp":1648684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61401227"],"award-info":[{"award-number":["61401227"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Circuits Syst Signal Process"],"published-print":{"date-parts":[[2022,8]]},"DOI":"10.1007\/s00034-022-01998-5","type":"journal-article","created":{"date-parts":[[2022,3,31]],"date-time":"2022-03-31T04:02:45Z","timestamp":1648699365000},"page":"4632-4648","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Non-parallel Voice Conversion Based on Perceptual Star Generative Adversarial Network"],"prefix":"10.1007","volume":"41","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6208-556X","authenticated-orcid":false,"given":"Yanping","family":"Li","sequence":"first","affiliation":[]},{"given":"Xiangtian","family":"Qiu","sequence":"additional","affiliation":[]},{"given":"Pan","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Bingkun","family":"Bao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,3,31]]},"reference":[{"key":"1998_CR1","unstructured":"M. Abe, S. Nakamura, K. Shikano, H. Kuwabara, Voice conversion through vector quantization, in ICASSP-88, International Conference on Acoustics, Speech, and Signal Processing, vol. 1, pp. 655\u2013658 (1988)"},{"key":"1998_CR2","unstructured":"J.L. Ba, J.R. Kiros, G.E. Hinton, Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"1998_CR3","doi-asserted-by":"crossref","unstructured":"Y. Cao, Z. Liu, M. Chen, J. Ma, S. Wang, J. Xiao, Nonparallel emotional speech conversion using Vae-Gan, in INTERSPEECH (2020)","DOI":"10.21437\/Interspeech.2020-1647"},{"key":"1998_CR4","doi-asserted-by":"crossref","unstructured":"L.-W. Chen, H. Yi Lee, Y. Tsao, Generative adversarial networks for unpaired voice transformation on impaired speech, in INTERSPEECH (2019)","DOI":"10.21437\/Interspeech.2019-1265"},{"key":"1998_CR5","doi-asserted-by":"crossref","unstructured":"Y. Choi, M. Choi, M. Kim, J.-W. Ha, S. Kim, J. Choo, Stargan: unified generative adversarial networks for multi-domain image-to-image translation, in Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8789\u20138797 (2018)","DOI":"10.1109\/CVPR.2018.00916"},{"key":"1998_CR6","doi-asserted-by":"crossref","unstructured":"F. Fang, J. Yamagishi, I. Echizen, J. Lorenzo-Trueba, High-quality nonparallel voice conversion based on cycle-consistent adversarial network, in 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2018), pp. 5279\u20135283","DOI":"10.1109\/ICASSP.2018.8462342"},{"key":"1998_CR7","doi-asserted-by":"crossref","unstructured":"R. Ferro, N. Obin, A. Roebel, Cyclegan voice conversion of spectral envelopes using adversarial weights, in 2020 28th European Signal Processing Conference (EUSIPCO) (2021), pp. 406\u2013410","DOI":"10.23919\/Eusipco47968.2020.9287643"},{"key":"1998_CR8","doi-asserted-by":"crossref","unstructured":"L. Gatys, A.S. Ecker, M. Bethge, Texture synthesis using convolutional neural networks, in Advances in Neural Information Processing Systems (2015), pp. 262\u2013270","DOI":"10.1109\/CVPR.2016.265"},{"key":"1998_CR9","doi-asserted-by":"crossref","unstructured":"L.A. Gatys, A.S. Ecker, M. Bethge, A neural algorithm of artistic style. arXiv preprint arXiv:1508.06576 (2015)","DOI":"10.1167\/16.12.326"},{"key":"1998_CR10","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016), pp. 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"5","key":"1998_CR11","doi-asserted-by":"publisher","first-page":"912","DOI":"10.1109\/TASL.2010.2041699","volume":"18","author":"E Helander","year":"2010","unstructured":"E. Helander, T. Virtanen, J. Nurminen, M. Gabbouj, Voice conversion using partial least squares regression. IEEE Trans. Audio Speech Lang. Process. 18(5), 912\u2013921 (2010)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"1998_CR12","doi-asserted-by":"crossref","unstructured":"C.-C. Hsu, H.-T. Hwang, Y.-C. Wu, Y. Tsao, H.-M. Wang, Voice conversion from non-parallel corpora using variational auto-encoder, in 2016 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA) (IEEE, 2016), pp. 1\u20136","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"1998_CR13","doi-asserted-by":"crossref","unstructured":"C.-C. Hsu, H.-T. Hwang, Y.-C. Wu, Y.\u00a0Tsao, H.-M. Wang, Voice conversion from unaligned corpora using variational autoencoding Wasserstein generative adversarial networks. arXiv preprint arXiv:1704.00849 (2017)","DOI":"10.21437\/Interspeech.2017-63"},{"key":"1998_CR14","doi-asserted-by":"crossref","unstructured":"W.-C. Huang, H.-T. Hwang, Y.-H. Peng, Y.\u00a0Tsao, H.-M. Wang, Voice conversion based on cross-domain features using variational auto encoders, In 2018 11th International Symposium on Chinese Spoken Language Processing (ISCSLP) (IEEE, 2018), pp. 51\u201355","DOI":"10.1109\/ISCSLP.2018.8706604"},{"key":"1998_CR15","unstructured":"S.\u00a0Ioffe, C.\u00a0Szegedy, Batch normalization: accelerating deep network training by reducing internal covariate shift. arXiv preprint arXiv:1502.03167 (2015)"},{"key":"1998_CR16","doi-asserted-by":"crossref","unstructured":"J.\u00a0Johnson, A.\u00a0Alahi, L.\u00a0Fei-Fei, Perceptual losses for real-time style transfer and super-resolution, in European Conference on Computer Vision (Springer, 2016), pp. 694\u2013711","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"1998_CR17","doi-asserted-by":"crossref","unstructured":"H.\u00a0Kameoka, T.\u00a0Kaneko, K.\u00a0Tanaka, N.\u00a0Hojo, Stargan-vc: non-parallel many-to-many voice conversion using star generative adversarial networks, in 2018 IEEE Spoken Language Technology Workshop (SLT) (IEEE, 2018), pp. 266\u2013273","DOI":"10.1109\/SLT.2018.8639535"},{"key":"1998_CR18","doi-asserted-by":"crossref","unstructured":"T.\u00a0Kaneko, H.\u00a0Kameoka, Parallel-data-free voice conversion using cycle-consistent adversarial networks. arXiv preprint arXiv:1711.11293 (2017)","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"1998_CR19","doi-asserted-by":"crossref","unstructured":"T. Kaneko, H. Kameoka, K. Hiramatsu, K. Kashino, Sequence-to-sequence voice conversion with similarity metric learned using generative adversarial networks, in INTERSPEECH 2017 (2017), pp. 1283\u20131287","DOI":"10.21437\/Interspeech.2017-970"},{"key":"1998_CR20","doi-asserted-by":"crossref","unstructured":"T.\u00a0Kaneko, H.\u00a0Kameoka, K.\u00a0Tanaka, N.\u00a0Hojo, Cyclegan-vc2: improved cyclegan-based non-parallel voice conversion, in ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2019), pp. 6820\u20136824","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"1998_CR21","doi-asserted-by":"crossref","unstructured":"T.\u00a0Kaneko, H.\u00a0Kameoka, K.\u00a0Tanaka, N.\u00a0Hojo, Stargan-vc2: rethinking conditional methods for Stargan-based voice conversion. arXiv preprint arXiv:1907.12279 (2019)","DOI":"10.21437\/Interspeech.2019-2236"},{"key":"1998_CR22","unstructured":"D.P. Kingma, J.\u00a0Ba, Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"1998_CR23","doi-asserted-by":"crossref","unstructured":"J.\u00a0Li, E.\u00a0Chen, Z.\u00a0Ding, L.\u00a0Zhu, K.\u00a0Lu, Z.\u00a0Huang, Cycle-consistent conditional adversarial transfer networks, in Proceedings of the 27th ACM International Conference on Multimedia (2019), pp. 747\u2013755","DOI":"10.1145\/3343031.3350902"},{"key":"1998_CR24","first-page":"2150188:1","volume":"30","author":"Y Li","year":"2021","unstructured":"Y. Li, Z. He, Y. Zhang, Z. Yang, High-quality many-to-many voice conversion using transitive star generative adversarial networks with adaptive instance normalization. J. Circuits Syst. Comput. 30, 2150188:1-2150188:19 (2021)","journal-title":"J. Circuits Syst. Comput."},{"key":"1998_CR25","doi-asserted-by":"crossref","unstructured":"Y.\u00a0Li, D.\u00a0Xu, Y.\u00a0Zhang, Y.\u00a0Wang, B.\u00a0Chen, Non-parallel many-to-many voice conversion with PSR-STARGAN, in INTERSPEECH (2020)","DOI":"10.21437\/Interspeech.2020-1310"},{"key":"1998_CR26","doi-asserted-by":"crossref","unstructured":"K.\u00a0Liu, J.\u00a0Zhang, Y.\u00a0Yan, High quality voice conversion through phoneme-based linear mapping functions with straight for mandarin, in Fourth International Conference on Fuzzy Systems and Knowledge Discovery (FSKD 2007), vol. 4 (IEEE, 2007), pp. 410\u2013414","DOI":"10.1109\/FSKD.2007.347"},{"key":"1998_CR27","doi-asserted-by":"crossref","unstructured":"J.\u00a0Lorenzo-Trueba, J.\u00a0Yamagishi, T.\u00a0Toda, D.\u00a0Saito, F.\u00a0Villavicencio, T.\u00a0Kinnunen, Z.\u00a0Ling, The voice conversion challenge 2018: promoting development of parallel and nonparallel methods. arXiv preprint arXiv:1804.04262 (2018)","DOI":"10.21437\/Odyssey.2018-28"},{"key":"1998_CR28","unstructured":"J.\u00a0Lu, K.\u00a0Zhou, B.\u00a0Sisman, H.\u00a0Li, Vaw-gan for singing voice conversion with non-parallel training data, in 2020 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC) (2020), pp. 514\u2013519"},{"key":"1998_CR29","unstructured":"P.\u00a0Luo, R.\u00a0Zhang, J.\u00a0Ren, Z.\u00a0Peng, J.\u00a0Li, Switchable normalization for learning-to-normalize deep representation. IEEE Trans. Pattern Anal. Mach. Intell. (2019)"},{"key":"1998_CR30","doi-asserted-by":"crossref","unstructured":"H.\u00a0Ming, D.\u00a0Huang, L.\u00a0Xie, J.\u00a0Wu, M.\u00a0Dong, H.\u00a0Li, Deep bidirectional LSTM modeling of timbre and prosody for emotional voice conversion, in Interspeech (2016)","DOI":"10.21437\/Interspeech.2016-1053"},{"key":"1998_CR31","doi-asserted-by":"crossref","unstructured":"H.\u00a0Miyoshi, Y.\u00a0Saito, S.\u00a0Takamichi, H.\u00a0Saruwatari, Voice conversion using sequence-to-sequence learning of context posterior probabilities. arXiv preprint arXiv:1704.02360 (2017)","DOI":"10.21437\/Interspeech.2017-247"},{"issue":"7","key":"1998_CR32","doi-asserted-by":"publisher","first-page":"1877","DOI":"10.1587\/transinf.2015EDP7457","volume":"99","author":"M Morise","year":"2016","unstructured":"M. Morise, F. Yokomori, K. Ozawa, World: a vocoder-based high-quality speech synthesis system for real-time applications. IEICE Trans. Inf. Syst. 99(7), 1877\u20131884 (2016)","journal-title":"IEICE Trans. Inf. Syst."},{"key":"1998_CR33","doi-asserted-by":"publisher","first-page":"134","DOI":"10.1016\/j.specom.2011.07.007","volume":"54","author":"K Nakamura","year":"2012","unstructured":"K. Nakamura, T. Toda, H. Saruwatari, K. Shikano, Speaking-aid systems using GMM-based voice conversion for electrolaryngeal speech. Speech Commun. 54, 134\u2013146 (2012)","journal-title":"Speech Commun."},{"key":"1998_CR34","doi-asserted-by":"crossref","unstructured":"J.\u00a0Parekh, P.\u00a0Rao, Y.-H. Yang, Speech-to-singing conversion in an encoder-decoder framework, in ICASSP 2020\u20142020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020), pp. 261\u2013265","DOI":"10.1109\/ICASSP40776.2020.9054473"},{"key":"1998_CR35","doi-asserted-by":"crossref","unstructured":"H.\u00a0Ren, M.\u00a0El-Khamy, J.\u00a0Lee, DN-RESNET: efficient deep residual network for image denoising, in Asian Conference on Computer Vision (Springer, 2018), pp. 215\u2013230","DOI":"10.1007\/978-3-030-20873-8_14"},{"issue":"3","key":"1998_CR36","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"O. Russakovsky, J. Deng, H. Su, J. Krause, S. Satheesh, S. Ma, Z. Huang, A. Karpathy, A. Khosla, M. Bernstein et al., Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"1998_CR37","doi-asserted-by":"crossref","unstructured":"Y.\u00a0Saito, Y.\u00a0Ijima, K.\u00a0Nishida, S.\u00a0Takamichi, Non-parallel voice conversion using variational autoencoders conditioned by phonetic posteriorgrams and d-vectors, in 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2018), pp. 5274\u20135278","DOI":"10.1109\/ICASSP.2018.8461384"},{"key":"1998_CR38","unstructured":"K.\u00a0Simonyan, A.\u00a0Zisserman, Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"issue":"2","key":"1998_CR39","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1109\/89.661472","volume":"6","author":"Y Stylianou","year":"1998","unstructured":"Y. Stylianou, O. Capp\u00e9, E. Moulines, Continuous probabilistic transform for voice conversion. IEEE Trans. Speech Audio Process. 6(2), 131\u2013142 (1998)","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"1998_CR40","doi-asserted-by":"crossref","unstructured":"L.\u00a0Sun, S.\u00a0Kang, K.\u00a0Li, H.\u00a0Meng, Voice conversion using deep bidirectional long short-term memory based recurrent neural networks, in 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2015), pp. 4869\u20134873","DOI":"10.1109\/ICASSP.2015.7178896"},{"key":"1998_CR41","doi-asserted-by":"crossref","unstructured":"L.\u00a0Sun, K.\u00a0Li, H.\u00a0Wang, S.\u00a0Kang, H.\u00a0Meng, Phonetic posteriorgrams for many-to-one voice conversion without parallel data training, in 2016 IEEE International Conference on Multimedia and Expo (ICME) (IEEE, 2016), pp. 1\u20136","DOI":"10.1109\/ICME.2016.7552917"},{"key":"1998_CR42","doi-asserted-by":"crossref","unstructured":"K.\u00a0Tanaka, H.\u00a0Kameoka, T.\u00a0Kaneko, N.\u00a0Hojo, Atts2s-vc: sequence-to-sequence voice conversion with attention and context preservation mechanisms, In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (IEEE, 2019), pp. 6805\u20136809","DOI":"10.1109\/ICASSP.2019.8683282"},{"key":"1998_CR43","unstructured":"D.\u00a0Ulyanov, A.\u00a0Vedaldi, V.\u00a0Lempitsky, Instance normalization: the missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022 (2016)"},{"key":"1998_CR44","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1109\/MSP.2018.2875195","volume":"36","author":"K Vijayan","year":"2019","unstructured":"K. Vijayan, H. Li, T. Toda, Speech-to-singing voice conversion: the challenges and strategies for improving vocal conversion processes. IEEE Signal Process. Mag. 36, 95\u2013102 (2019)","journal-title":"IEEE Signal Process. Mag."},{"key":"1998_CR45","doi-asserted-by":"crossref","unstructured":"R.\u00a0Wang, Y.\u00a0Ding, L.\u00a0Li, C.\u00a0Fan, One-shot voice conversion using STAR-GAN, in ICASSP 2020\u20142020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2020), pp. 7729\u20137733","DOI":"10.1109\/ICASSP40776.2020.9053842"},{"key":"1998_CR46","doi-asserted-by":"crossref","unstructured":"D.\u00a0Wu, Y.-H. Yang, Speech-to-singing conversion based on boundary equilibrium GAN, in INTERSPEECH (2020)","DOI":"10.21437\/Interspeech.2020-1984"},{"key":"1998_CR47","doi-asserted-by":"crossref","unstructured":"Y.\u00a0Wu, K.\u00a0He, Group normalization, in Proceedings of the European Conference on Computer Vision (ECCV) (2018), pages 3\u201319","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"1998_CR48","doi-asserted-by":"crossref","unstructured":"M.\u00a0Zhang, B.\u00a0Sisman, S.S. Rallabandi, H.\u00a0Li, L.\u00a0Zhao, Error reduction network for DBLSTM-based voice conversion, in 2018 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC) (IEEE, 2018), pp. 823\u2013828","DOI":"10.23919\/APSIPA.2018.8659543"},{"key":"1998_CR49","doi-asserted-by":"crossref","unstructured":"S.\u00a0Zhao, T.\u00a0H. Nguyen, H.\u00a0Wang, B.\u00a0Ma, Fast learning for non-parallel many-to-many voice conversion with residual star generative adversarial networks, in Interspeech (2019), pp. 689\u2013693","DOI":"10.21437\/Interspeech.2019-2067"},{"key":"1998_CR50","doi-asserted-by":"crossref","unstructured":"T.\u00a0Zhou, P.\u00a0Krahenbuhl, M.\u00a0Aubry, Q.\u00a0Huang, A.A. Efros, Learning dense correspondence via 3D-guided cycle consistency, in Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 117\u2013126 (2016)","DOI":"10.1109\/CVPR.2016.20"},{"key":"1998_CR51","doi-asserted-by":"crossref","unstructured":"J.-Y. Zhu, T.\u00a0Park, P.\u00a0Isola, A.A. Efros, Unpaired image-to-image translation using cycle-consistent adversarial networks, in Proceedings of the IEEE International Conference on Computer Vision (2017), pp. 2223\u20132232","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["Circuits, Systems, and Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-022-01998-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00034-022-01998-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00034-022-01998-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,22]],"date-time":"2022-06-22T16:41:58Z","timestamp":1655916118000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00034-022-01998-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,31]]},"references-count":51,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2022,8]]}},"alternative-id":["1998"],"URL":"https:\/\/doi.org\/10.1007\/s00034-022-01998-5","relation":{},"ISSN":["0278-081X","1531-5878"],"issn-type":[{"type":"print","value":"0278-081X"},{"type":"electronic","value":"1531-5878"}],"subject":[],"published":{"date-parts":[[2022,3,31]]},"assertion":[{"value":"9 October 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 February 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 March 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}