{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T02:37:09Z","timestamp":1743043029544,"version":"3.40.3"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031251146"},{"type":"electronic","value":"9783031251153"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-25115-3_4","type":"book-chapter","created":{"date-parts":[[2023,1,28]],"date-time":"2023-01-28T13:57:23Z","timestamp":1674914243000},"page":"53-67","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Voice Conversion Using Learnable Similarity-Guided Masked Autoencoder"],"prefix":"10.1007","author":[{"given":"Yewei","family":"Gu","sequence":"first","affiliation":[]},{"given":"Xianfeng","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Xiaowei","family":"Yi","sequence":"additional","affiliation":[]},{"given":"Junchao","family":"Xiao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,1,29]]},"reference":[{"key":"4_CR1","first-page":"12449","volume":"33","author":"A Baevski","year":"2020","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: Wav2Vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Chen, Y.H., Wu, D.Y., Wu, T.H., Lee, H.Y.: Again-VC: a one-shot voice conversion using activation guidance and adaptive instance normalization. In: ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5954\u20135958. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9414257"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Choi, Y., Choi, M., Kim, M., Ha, J.W., Kim, S., Choo, J.: StarGAN: unified generative adversarial networks for multi-domain image-to-image translation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8789\u20138797 (2018)","DOI":"10.1109\/CVPR.2018.00916"},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Chou, J.C., Yeh, C.C., Lee, H.Y.: One-shot voice conversion by separating speaker and content representations with instance normalization. arXiv preprint arXiv:1904.05742 (2019)","DOI":"10.21437\/Interspeech.2019-2663"},{"issue":"5","key":"4_CR5","doi-asserted-by":"publisher","first-page":"922","DOI":"10.1109\/TASL.2009.2038663","volume":"18","author":"D Erro","year":"2009","unstructured":"Erro, D., Moreno, A., Bonafonte, A.: Voice conversion based on weighted frequency warping. IEEE Trans. Audio Speech Lang. Process. 18(5), 922\u2013931 (2009)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"4_CR6","unstructured":"Gu, Y., Zhang, Z., Yi, X., Zhao, X.: MediumVC: any-to-any voice conversion using synthetic specific-speaker speeches as intermedium features. arXiv preprint arXiv:2110.02500 (2021)"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"issue":"3","key":"4_CR8","doi-asserted-by":"publisher","first-page":"806","DOI":"10.1109\/TASL.2011.2165944","volume":"20","author":"E Helander","year":"2011","unstructured":"Helander, E., Sil\u00e9n, H., Virtanen, T., Gabbouj, M.: Voice conversion using dynamic kernel partial least squares regression. IEEE Trans. Audio Speech Lang. Process. 20(3), 806\u2013817 (2011)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"4_CR9","doi-asserted-by":"crossref","unstructured":"Hsu, C.C., Hwang, H.T., Wu, Y.C., Tsao, Y., Wang, H.M.: Voice conversion from non-parallel corpora using variational auto-encoder. In: 2016 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA), pp. 1\u20136. IEEE (2016)","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Hsu, C.C., Hwang, H.T., Wu, Y.C., Tsao, Y., Wang, H.M.: Voice conversion from unaligned corpora using variational autoencoding wasserstein generative adversarial networks. arXiv preprint arXiv:1704.00849 (2017)","DOI":"10.21437\/Interspeech.2017-63"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Huang, X., Belongie, S.: Arbitrary style transfer in real-time with adaptive instance normalization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1501\u20131510 (2017)","DOI":"10.1109\/ICCV.2017.167"},{"key":"4_CR12","first-page":"4485","volume":"31","author":"Y Jia","year":"2018","unstructured":"Jia, Y., et al.: Transfer learning from speaker verification to multispeaker text-to-speech synthesis. Adv. Neural Inf. Process. Syst. 31, 4485\u20134495 (2018)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Kameoka, H., Kaneko, T., Tanaka, K., Hojo, N.: ACVAE-VC: non-parallel many-to-many voice conversion with auxiliary classifier variational autoencoder. arXiv preprint arXiv:1808.05092 (2018)","DOI":"10.1109\/TASLP.2019.2917232"},{"key":"4_CR14","doi-asserted-by":"crossref","unstructured":"Kameoka, H., Kaneko, T., Tanaka, K., Hojo, N.: StarGAN-VC: non-parallel many-to-many voice conversion using star generative adversarial networks. In: 2018 IEEE Spoken Language Technology Workshop (SLT), pp. 266\u2013273. IEEE (2018)","DOI":"10.1109\/SLT.2018.8639535"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Kaneko, T., Kameoka, H.: CycleGAN-VC: non-parallel voice conversion using cycle-consistent adversarial networks. In: 2018 26th European Signal Processing Conference (EUSIPCO), pp. 2100\u20132104. IEEE (2018)","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"4_CR16","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"4_CR17","first-page":"294","volume":"34","author":"SH Lee","year":"2021","unstructured":"Lee, S.H., Kim, J.H., Chung, H., Lee, S.W.: VoiceMixer: adversarial voice style mixup. Adv. Neural. Inf. Process. Syst. 34, 294\u2013308 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Lin, Y.Y., Chien, C.M., Lin, J.H., Lee, H.Y., Lee, L.S.: FragmentVC: any-to-any voice conversion by end-to-end extracting and fusing fine-grained voice fragments with attention. In: ICASSP 2021\u20132021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5939\u20135943. IEEE (2021)","DOI":"10.1109\/ICASSP39728.2021.9413699"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Luan, Y., Saito, D., Kashiwagi, Y., Minematsu, N., Hirose, K.: Semi-supervised noise dictionary adaptation for exemplar-based noise robust speech recognition. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1745\u20131748. IEEE (2014)","DOI":"10.1109\/ICASSP.2014.6853897"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: LibriSpeech: an ASR corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"4_CR21","unstructured":"Qian, K., Zhang, Y., Chang, S., Yang, X., Hasegawa-Johnson, M.: AutoVC: zero-shot voice style transfer with only autoencoder loss. In: International Conference on Machine Learning, pp. 5210\u20135219. PMLR (2019)"},{"key":"4_CR22","unstructured":"Ren, Y., et al.: FastSpeech 2: fast and high-quality end-to-end text to speech. arXiv preprint arXiv:2006.04558 (2020)"},{"key":"4_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Shi, Y., Bu, H., Xu, X., Zhang, S., Li, M.: Aishell-3: a multi-speaker mandarin tts corpus and the baselines. arXiv preprint arXiv:2010.11567 (2020)","DOI":"10.21437\/Interspeech.2021-755"},{"issue":"10","key":"4_CR25","doi-asserted-by":"publisher","first-page":"1863","DOI":"10.1109\/TASLP.2017.2723721","volume":"25","author":"X Tian","year":"2017","unstructured":"Tian, X., Lee, S.W., Wu, Z., Chng, E.S., Li, H.: An exemplar-based approach to frequency warping for voice conversion. IEEE\/ACM Trans. Audio Speech Lang. Process. 25(10), 1863\u20131876 (2017)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"8","key":"4_CR26","doi-asserted-by":"publisher","first-page":"2222","DOI":"10.1109\/TASL.2007.907344","volume":"15","author":"T Toda","year":"2007","unstructured":"Toda, T., Black, A.W., Tokuda, K.: Voice conversion based on maximum-likelihood estimation of spectral parameter trajectory. IEEE Trans. Audio Speech Lang. Process. 15(8), 2222\u20132235 (2007)","journal-title":"IEEE Trans. Audio Speech Lang. Process."},{"key":"4_CR27","unstructured":"Ulyanov, D., Vedaldi, A., Lempitsky, V.: Instance normalization: the missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022 (2016)"},{"key":"4_CR28","unstructured":"Veaux, C., Yamagishi, J., MacDonald, K., et al.: Superseded-CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit (2016)"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Wan, L., Wang, Q., Papir, A., Moreno, I.L.: Generalized end-to-end loss for speaker verification. In: 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4879\u20134883. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Wu, D.Y., Chen, Y.H., Lee, H.Y.: VQVC+: one-shot voice conversion by vector quantization and U-net architecture. arXiv preprint arXiv:2006.04154 (2020)","DOI":"10.21437\/Interspeech.2020-1443"},{"key":"4_CR31","unstructured":"Zhao, Y., et al.: Voice conversion challenge 2020: intra-lingual semi-parallel and cross-lingual voice conversion. arXiv preprint arXiv:2008.12527 (2020)"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Zhu, J.Y., Park, T., Isola, P., Efros, A.A.: Unpaired image-to-image translation using cycle-consistent adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2223\u20132232 (2017)","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["Lecture Notes in Computer Science","Digital Forensics and Watermarking"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-25115-3_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,28]],"date-time":"2023-01-28T13:58:17Z","timestamp":1674914297000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-25115-3_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031251146","9783031251153"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-25115-3_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"29 January 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"IWDW","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Workshop on Digital Watermarking","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Guilin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 November 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 November 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iwdw2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iwdw.site\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"30","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"14","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"47% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.23","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1.43","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}