{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T21:18:00Z","timestamp":1757625480528,"version":"3.44.0"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783032025470"},{"type":"electronic","value":"9783032025487"}],"license":[{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:00Z","timestamp":1755820800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-02548-7_20","type":"book-chapter","created":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T05:39:03Z","timestamp":1755754743000},"page":"235-246","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Vocoder-Free Non-parallel Conversion of\u00a0Whispered Speech With Masked Cycle-Consistent Generative Adversarial Networks"],"prefix":"10.1007","author":[{"given":"Dominik","family":"Wagner","sequence":"first","affiliation":[]},{"given":"Ilja","family":"Baumann","sequence":"additional","affiliation":[]},{"given":"Tobias","family":"Bocklet","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,22]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Choi, Y., Choi, M., Kim, M., Ha, J., Kim, S., Choo, J.: StarGAN: unified generative adversarial networks for multi-domain image-to-image translation. In: 2018 IEEE\/CVF CVPR, pp. 8789\u20138797 (2018)","DOI":"10.1109\/CVPR.2018.00916"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Chou, J., Lee, H.: One-shot voice conversion by separating speaker and content representations with instance normalization. In: Proceedings Interspeech, pp. 664\u2013668. ISCA (2019)","DOI":"10.21437\/Interspeech.2019-2663"},{"key":"20_CR3","unstructured":"Dauphin, Y., Fan, A., Auli, M., Grangier, D.: Language modeling with gated convolutional networks. In: ICML 2017, pp. 933\u2013941 (2017)"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Fang, F., Yamagishi, J., Echizen, I., Lorenzo-Trueba, J.: High-quality nonparallel voice conversion based on cycle-consistent adversarial network. In: 2018 IEEE ICASSP, pp. 5279\u20135283 (2018)","DOI":"10.1109\/ICASSP.2018.8462342"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Gao, T., Pan, Q., Zhou, J., Wang, H., Tao, L., Kwan, H.K.: A novel attention-guided generative adversarial network for whisper-to-normal speech conversion. Cogn. Comput. (2023)","DOI":"10.1007\/s12559-023-10108-9"},{"key":"20_CR6","unstructured":"Garofolo, J., et al.: TIMIT acoustic-phonetic continuous speech corpus LDC93S1. Linguistic Data Consortium (1993)"},{"key":"20_CR7","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: NeurIPS, vol. 27 (2014)"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE CVPR, pp. 
770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Helander, E., Schwarz, J., Nurminen, J., Silen, H., Gabbouj, M.: On the impact of alignment on voice conversion performance. In: Proceedings Interspeech, pp. 1453\u20131456. ISCA (2008)","DOI":"10.21437\/Interspeech.2008-419"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J., Zhou, T., Efros, A.: Image-to-image translation with conditional adversarial networks. In: 2017 IEEE CVPR, pp. 5967\u20135976 (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Kaneko, T., Kameoka, H.: CycleGAN-VC: non-parallel voice conversion using cycle-consistent adversarial networks. In: 26th EUSIPCO, pp. 2100\u20132104 (2018)","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"20_CR12","doi-asserted-by":"crossref","unstructured":"Kaneko, T., Kameoka, H., Tanaka, K., Hojo, N.: CycleGAN-VC2: improved CycleGAN-based non-parallel voice conversion. In: 2019 IEEE ICASSP (2019)","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Kaneko, T., Kameoka, H., Tanaka, K., Hojo, N.: CycleGAN-VC3: examining and improving CycleGAN-VCs for mel-spectrogram conversion. In: Proceedings Interspeech, pp. 2017\u20132021. ISCA (2020)","DOI":"10.21437\/Interspeech.2020-2280"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Kaneko, T., Kameoka, H., Tanaka, K., Hojo, N.: MaskCycleGAN-VC: learning non-parallel voice conversion with filling in frames. In: 2021 IEEE ICASSP, pp. 5919\u20135923 (2021)","DOI":"10.1109\/ICASSP39728.2021.9414851"},{"key":"20_CR15","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: 3rd ICLR (2015)"},{"key":"20_CR16","unstructured":"Kong, J., Kim, J., Bae, J.: HiFi-GAN: generative adversarial networks for efficient and high fidelity speech synthesis. In: NeurIPS, vol.\u00a033, pp. 17022\u201317033 (2020)"},{"key":"20_CR17","unstructured":"Kumar, K., et al.: MelGAN: generative adversarial networks for conditional waveform synthesis. In: NeurIPS, vol.\u00a032 (2019)"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Li, C., Wand, M.: Precomputed real-time texture synthesis with Markovian generative adversarial networks. In: ECCV, pp. 702\u2013716 (2016)","DOI":"10.1007\/978-3-319-46487-9_43"},{"key":"20_CR19","unstructured":"Lim, B.P.: Computational differences between whispered and non-whispered speech. Ph.D. thesis, University of Illinois (2010)"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Lin, Y., Chien, C., Lin, J., Lee, H., Lee, L.: FragmentVC: any-to-any voice conversion by end-to-end extracting and fusing fine-grained voice fragments with attention. In: 2021 IEEE ICASSP, pp. 5939\u20135943 (2021)","DOI":"10.1109\/ICASSP39728.2021.9413699"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Liu, S., Cao, Y., Su, D., Meng, H.: DiffSVC: a diffusion probabilistic model for singing voice conversion. In: 2021 IEEE ASRU, pp. 741\u2013748 (2021)","DOI":"10.1109\/ASRU51503.2021.9688219"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Morise, M., Yokomori, F., Ozawa, K.: WORLD: a vocoder-based high-quality speech synthesis system for real-time applications. IEICE Trans. Inf. Syst., pp. 
1877\u20131884 (2016)","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Malaviya, H., Shah, J., Patel, M., Munshi, J., Patil, H.: Mspec-Net: multi-domain speech conversion network. In: 2020 IEEE ICASSP, pp. 7764\u20137768 (2020)","DOI":"10.1109\/ICASSP40776.2020.9052966"},{"key":"20_CR24","doi-asserted-by":"publisher","unstructured":"Mao, X., Li, Q., Xie, H., Lau, R., Wang, Z., Smolley, S.: Least squares generative adversarial networks. In: 2017 IEEE ICCV, pp. 2813\u20132821 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.304","DOI":"10.1109\/ICCV.2017.304"},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Meenakshi, G., Ghosh, P.: Whispered speech to neutral speech conversion using bidirectional LSTMs. In: Proceedings Interspeech, pp. 491\u2013495. ISCA (2018)","DOI":"10.21437\/Interspeech.2018-1487"},{"key":"20_CR26","doi-asserted-by":"publisher","first-page":"65","DOI":"10.1016\/j.specom.2017.01.008","volume":"88","author":"SH Mohammadi","year":"2017","unstructured":"Mohammadi, S.H., Kain, A.: An overview of voice conversion systems. Speech Commun. 88, 65\u201382 (2017)","journal-title":"Speech Commun."},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Nguyen, B., Cardinaux, F.: NVC-Net: end-to-end adversarial voice conversion. In: 2022 IEEE ICASSP, pp. 7012\u20137016 (2022)","DOI":"10.1109\/ICASSP43922.2022.9747020"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Parmar, M., Doshi, S., Shah, N., Patel, M., Patil, H.: Effectiveness of cross-domain architectures for whisper-to-normal speech conversion. In: 27th EUSIPCO, pp.\u00a01\u20135 (2019)","DOI":"10.23919\/EUSIPCO.2019.8902961"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Pascual, S., Bonafonte, A., Serr\u00e0, J., Gonz\u00e1lez L\u00f3pez, J.A.: Whispered-to-voiced alaryngeal speech conversion with generative adversarial networks. In: Proceedings IberSPEECH 2018, pp. 117\u2013121 (2018)","DOI":"10.21437\/IberSPEECH.2018-25"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Patel, M., Purohit, M., Shah, J., Patil, H.: CinC-GAN for effective F0 prediction for whisper-to-normal speech conversion. In: 28th EUSIPCO, pp. 411\u2013415 (2021)","DOI":"10.23919\/Eusipco47968.2020.9287385"},{"key":"20_CR31","unstructured":"Popov, V., Vovk, I., Gogoryan, V., Sadekova, T., Kudinov, M., Wei, J.: Diffusion-based voice conversion with fast maximum likelihood sampling scheme. In: ICLR (2022)"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Qian, K., Jin, Z., Hasegawa-Johnson, M., Mysore, G.: F0-consistent many-to-many non-parallel voice conversion via conditional autoencoder. In: 2020 IEEE ICASSP, pp. 6284\u20136288 (2020)","DOI":"10.1109\/ICASSP40776.2020.9054734"},{"key":"20_CR33","unstructured":"Qian, K., Zhang, Y., Chang, S., Yang, X., Hasegawa-Johnson, M.: AutoVC: zero-shot voice style transfer with only autoencoder loss. In: ICML, pp. 5210\u20135219 (2019)"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Rekimoto, J.: Wesper: zero-shot and realtime whisper to normal voice conversion for whisper-based speech interactions. In: CHI Conference on Human Factors in Computing Systems (2023)","DOI":"10.1145\/3544548.3580706"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Shi, W., et al.: Real-time single image and video super-resolution using an efficient sub-pixel convolutional neural network. In: 2016 IEEE CVPR, pp. 
1874\u20131883 (2016)","DOI":"10.1109\/CVPR.2016.207"},{"key":"20_CR36","doi-asserted-by":"publisher","first-page":"132","DOI":"10.1109\/TASLP.2020.3038524","volume":"29","author":"B Sisman","year":"2021","unstructured":"Sisman, B., Yamagishi, J., King, S., Li, H.: An overview of voice conversion and its challenges: from statistical modeling to deep learning. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 132\u2013157 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"20_CR37","unstructured":"Taigman, Y., Polyak, A., Wolf, L.: Unsupervised cross-domain image generation. In: ICLR (2017)"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Tseng, W., Huang, C., Kao, W., Lin, Y., Lee, H.: Utilizing self-supervised representations for MOS prediction. In: Proceedings Interspeech, pp. 2781\u20132785. ISCA (2021)","DOI":"10.21437\/Interspeech.2021-2013"},{"key":"20_CR39","unstructured":"Ulyanov, D., Vedaldi, A., Lempitsky, V.: Instance normalization: the missing ingredient for fast stylization. arXiv (2016)"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Wagner, D., Bayerl, S., Maruri, H., Bocklet, T.: Generative models for improved naturalness, intelligibility, and voicing of whispered speech. In: 2022 IEEE SLT, pp. 943\u2013948 (2023)","DOI":"10.1109\/SLT54892.2023.10022796"},{"key":"20_CR41","doi-asserted-by":"publisher","first-page":"1093","DOI":"10.1007\/s10772-024-10161-1","volume":"27","author":"D Wagner","year":"2024","unstructured":"Wagner, D., Baumann, I., Bocklet, T.: Generative adversarial networks for whispered to voiced speech conversion: a comparative study. Int. J. Speech Technol. 27, 1093\u20131110 (2024)","journal-title":"Int. J. Speech Technol."},{"key":"20_CR42","unstructured":"Yamagishi, J., Veaux, C., MacDonald, K.: CSTR VCTK corpus: English multi-speaker corpus for CSTR voice cloning toolkit (2019)"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Zhu, J., Park, T., Isola, P., Efros, A.: Unpaired image-to-image translation using cycle-consistent adversarial networks. 
In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["Lecture Notes in Computer Science","Text, Speech, and Dialogue"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-02548-7_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T18:05:30Z","timestamp":1757441130000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-02548-7_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,22]]},"ISBN":["9783032025470","9783032025487"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-02548-7_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025,8,22]]},"assertion":[{"value":"22 August 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"TSD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Text, Speech, and Dialogue","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Erlangen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"tsd2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.kiv.zcu.cz\/tsd2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}