{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T17:10:04Z","timestamp":1755882604373,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":16,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T00:00:00Z","timestamp":1709251200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3]]},"DOI":"10.1145\/3672919.3672988","type":"proceedings-article","created":{"date-parts":[[2024,7,24]],"date-time":"2024-07-24T12:39:43Z","timestamp":1721824783000},"page":"375-379","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-lingual voice conversion based on F0 multi-scale modeling with VITS"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9779-9466","authenticated-orcid":false,"given":"Danyang","family":"Cao","sequence":"first","affiliation":[{"name":"School of information and technology, North China University of Technology, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5529-5894","authenticated-orcid":false,"given":"Zeyi","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Information and Technology, North China University of Technology, China"}]}],"member":"320","published-online":{"date-parts":[[2024,7,24]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"6794","volume-title":"Speech and Signal Processing (ICASSP). IEEE","author":"Zhou Yi","year":"2019","unstructured":"Yi Zhou, Xiaohai Tian, Haihua Xu, Rohan Kumar Das, and Haizhou Li, \"Cross-lingual voice conversion with bilingual phonetic posteriorgram and average modeling,\" in ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2019, pp. 6790\u20136794."},{"key":"e_1_3_2_1_2_1","first-page":"5","volume-title":"ICASSP 2023- 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Li Jingyi","unstructured":"Jingyi Li, Weiping Tu, and Li Xiao. Freevc: Towards high-quality text-free one-shot voice conversion. In ICASSP 2023- 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pages 1\u20135. IEEE, 2023."},{"key":"e_1_3_2_1_3_1","volume-title":"Carlos Toshinori Ishi, and Hiroshi Ishiguro. Quickvc:Many-to-any voice conversion using inverse short-time fourier transform for faster conversion. arXiv preprint arXiv:2302.08296","author":"Guo Houjian","year":"2023","unstructured":"Houjian Guo, Chaoran Liu, Carlos Toshinori Ishi, and Hiroshi Ishiguro. Quickvc:Many-to-any voice conversion using inverse short-time fourier transform for faster conversion. arXiv preprint arXiv:2302.08296, 2023."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Disong Wang Liqun Deng Yu Ting Yeung Xiao Chen Xunying Liu and Helen Meng. Vqmivc: Vector quantization and mutual information-based unsupervised speech representation disentanglement for one-shot voice conversion. arXiv preprint arXiv:2106.10132 2021.","DOI":"10.21437\/Interspeech.2021-283"},{"key":"e_1_3_2_1_5_1","first-page":"4783","volume-title":"2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)","author":"Shen Jonathan","unstructured":"Jonathan Shen, Ruoming Pang, Ron J Weiss, Mike Schuster, Navdeep Jaitly, Zongheng Yang, Zhifeng Chen, Yu Zhang, Yuxuan Wang, Rj Skerrv- Ryan, Natural tts synthesis by conditioning wavenet on mel spectrogram predictions. In 2018 IEEE international conference on acoustics, speech and signal processing (ICASSP), pages 4779\u20134783. IEEE, 2018."},{"key":"e_1_3_2_1_6_1","volume-title":"Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558","author":"Ren Yi","year":"2020","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and TieYan Liu. Fastspeech 2: Fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558, 2020."},{"key":"e_1_3_2_1_7_1","first-page":"5540","volume-title":"International Conference on Machine Learning","author":"Kim Jaehyeon","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. Conditional variational autoencoder with adversarial learning for end-to-end text-tospeech. In International Conference on Machine Learning, pages 5530\u20135540. PMLR, 2021."},{"key":"e_1_3_2_1_8_1","first-page":"1969","article-title":"Frame alignment method for cross-lingual voice conversion","author":"Erro Daniel","year":"2007","unstructured":"Daniel Erro and Asunci\u00f3n Moreno, \"Frame alignment method for cross-lingual voice conversion,\" in INTERSPEECH, 2007, pp. 1969\u2013 1972.","journal-title":"INTERSPEECH"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2009.2038669"},{"key":"e_1_3_2_1_10_1","first-page":"676","article-title":"Vtln-based cross-language voice conversion","author":"Sundermann David","year":"2003","unstructured":"David Sundermann, Hermann Ney, and H Hoge, \"Vtln-based cross-language voice conversion,\" in IEEE ASRU, 2003, pp. 676\u2013681.","journal-title":"IEEE ASRU"},{"key":"e_1_3_2_1_11_1","first-page":"5120","article-title":"A frame mapping based hmm approach to cross-lingual voice transformation","author":"Qian Yao","year":"2011","unstructured":"Yao Qian, Ji Xu, and Frank K Soong, \"A frame mapping based hmm approach to cross-lingual voice transformation,\" in IEEE ICASSP, 2011, pp. 5120\u20135123.","journal-title":"IEEE ICASSP"},{"key":"e_1_3_2_1_12_1","first-page":"04356","article-title":"Robust speech recognition via large-scale weak supervision","volume":"2212","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever, \"Robust speech recognition via large-scale weak supervision,\" CoRR, vol. abs\/2212.04356, 2022.","journal-title":"CoRR"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3076867"},{"key":"e_1_3_2_1_14_1","first-page":"1","volume-title":"Taipei","author":"Zhou Y.","year":"2023","unstructured":"Y. Zhou, M. Chen, Y. Lei, J. Zhu and W. Zhao, \"VITS-Based Singing Voice Conversion System with DSPGAN Post-Processing for SVCC2023,\" 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), Taipei, Taiwan, 2023, pp. 1-8."},{"key":"e_1_3_2_1_15_1","first-page":"415","volume-title":"TASLP","volume":"28","author":"Wang Xin","year":"2020","unstructured":"Xin Wang, Shinji Takaki, and Junichi Yamagishi, \"Neural source-filter waveform models for statistical parametric speech synthesis,\" Proc. TASLP, vol. 28, pp. 402\u2013415, 2020."},{"key":"e_1_3_2_1_16_1","first-page":"1","volume-title":"Taipei","author":"Guo H.","year":"2023","unstructured":"H. Guo, C. Liu, C. T. Ishi and H. Ishiguro, \"Using Joint Training Speaker Encoder With Consistency Loss to Achieve Cross-Lingual Voice Conversion and Expressive Voice Conversion,\" 2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), Taipei, Taiwan, 2023, pp. 1-8."}],"event":{"name":"CSAIDE 2024: 2024 3rd International Conference on Cyber Security, Artificial Intelligence and Digital Economy","acronym":"CSAIDE 2024","location":"Nanjing China"},"container-title":["Proceedings of the 2024 3rd International Conference on Cyber Security, Artificial Intelligence and Digital Economy"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3672919.3672988","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3672919.3672988","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T16:35:25Z","timestamp":1755880525000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3672919.3672988"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3]]},"references-count":16,"alternative-id":["10.1145\/3672919.3672988","10.1145\/3672919"],"URL":"https:\/\/doi.org\/10.1145\/3672919.3672988","relation":{},"subject":[],"published":{"date-parts":[[2024,3]]},"assertion":[{"value":"2024-07-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}