{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T16:19:39Z","timestamp":1778948379161,"version":"3.51.4"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T00:00:00Z","timestamp":1771891200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T00:00:00Z","timestamp":1771891200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"The Industry-university Cooperation and Collaborative Education Project of the Ministry of Education","award":["231003221260825"],"award-info":[{"award-number":["231003221260825"]}]},{"name":"The 2024 Graduate Education Reform Project of Shanghai Dianji University","award":["A102252401102029"],"award-info":[{"award-number":["A102252401102029"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11760-026-05196-7","type":"journal-article","created":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T08:45:40Z","timestamp":1771922740000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["MTC-VC: A Multi-Task Contrastive Learning Method for Controllable and Efficiency-Balanced Voice Cloning"],"prefix":"10.1007","volume":"20","author":[{"given":"Rui","family":"Zhou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yan","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Baili","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,24]]},"reference":[{"key":"5196_CR1","doi-asserted-by":"crossref","unstructured":"Kadam, S., Jikamade, A., Mattoo, P., Hole, V.: Revoice: A neural network based voice cloning system. In: 2024 IEEE 9th International Conference for Convergence in Technology (I2CT), pp. 1\u20136 (2024). IEEE","DOI":"10.1109\/I2CT61223.2024.10543448"},{"key":"5196_CR2","doi-asserted-by":"crossref","unstructured":"Xie, T., Rong, Y., Zhang, P., Liu, L.: Towards controllable speech synthesis in the era of large language models: A survey (2024). arXiv preprint arXiv:2412.06602","DOI":"10.18653\/v1\/2025.emnlp-main.40"},{"key":"5196_CR3","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1109\/TASLP.2022.3145293","volume":"30","author":"Y Lei","year":"2022","unstructured":"Lei, Y., Yang, S., Wang, X., Xie, L.: Msemotts: Multi-scale emotion transfer, prediction, and control for emotional speech synthesis. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 853\u2013864 (2022)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"5196_CR4","unstructured":"Ren, Y., Hu, C., Tan, X., Qin, T., Zhao, S., Zhao, Z., Liu, T.-Y.: Fastspeech 2: Fast and high-quality end-to-end text to speech (2020). arXiv preprint arXiv:2006.04558"},{"key":"5196_CR5","doi-asserted-by":"publisher","first-page":"1506","DOI":"10.1109\/TASLP.2024.3363444","volume":"32","author":"X Zhu","year":"2024","unstructured":"Zhu, X., Lei, Y., Li, T., Zhang, Y., Zhou, H., Lu, H., Xie, L.: Metts: Multilingual emotional text-to-speech by cross-speaker and cross-lingual emotion transfer. IEEE\/ACM Trans. Audio Speech Lang. Process. 32, 1506\u20131518 (2024)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"5196_CR6","doi-asserted-by":"crossref","unstructured":"Wang, J., Li, J., Zhao, X., Wu, Z., Kang, S., Meng, H.: Adversarially learning disentangled speech representations for robust multi-factor voice conversion (2021). arXiv preprint arXiv:2102.00184","DOI":"10.21437\/Interspeech.2021-1990"},{"issue":"19","key":"5196_CR7","doi-asserted-by":"publisher","first-page":"6213","DOI":"10.3390\/s24196213","volume":"24","author":"C Go","year":"2024","unstructured":"Go, C., Lee, Y.H., Kim, T., Park, N.I., Chun, C.: Contrastive speaker representation learning with hard negative sampling for speaker recognition. Sensors 24(19), 6213 (2024)","journal-title":"Sensors"},{"key":"5196_CR8","doi-asserted-by":"publisher","first-page":"4263","DOI":"10.1109\/TASLP.2024.3453598","volume":"32","author":"Y Li","year":"2024","unstructured":"Li, Y., Yu, C., Sun, G., Zu, W., Tian, Z., Wen, Y., Pan, W., Zhang, C., Wang, J., Yang, Y., et al.: Cross-utterance conditioned vae for speech generation. IEEE\/ACM Trans. Audio Speech Lang. Process. 32, 4263\u20134276 (2024)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"5196_CR9","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101869","volume":"99","author":"A Mehrish","year":"2023","unstructured":"Mehrish, A., Majumder, N., Bharadwaj, R., Mihalcea, R., Poria, S.: A review of deep learning techniques for speech processing. Information Fusion 99, 101869 (2023)","journal-title":"Information Fusion"},{"issue":"20","key":"5196_CR10","doi-asserted-by":"publisher","first-page":"9595","DOI":"10.3390\/app14209595","volume":"14","author":"D Yook","year":"2024","unstructured":"Yook, D., Han, G., Chang, H.-P., Yoo, I.-C.: Cyclediffusion: Voice conversion using cycle-consistent diffusion models. Appl. Sci. 14(20), 9595 (2024)","journal-title":"Appl. Sci."},{"issue":"16","key":"5196_CR11","doi-asserted-by":"publisher","first-page":"7283","DOI":"10.3390\/s23167283","volume":"23","author":"Y Deng","year":"2023","unstructured":"Deng, Y., Wu, N., Qiu, C., Chen, Y., Gao, X.: Research on speech synthesis based on mixture alignment mechanism. Sensors 23(16), 7283 (2023)","journal-title":"Sensors"},{"key":"5196_CR12","doi-asserted-by":"publisher","first-page":"1720","DOI":"10.1109\/TASLP.2023.3268730","volume":"31","author":"D Yang","year":"2023","unstructured":"Yang, D., Yu, J., Wang, H., Wang, W., Weng, C., Zou, Y., Yu, D.: Diffsound: Discrete diffusion model for text-to-sound generation. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 1720\u20131733 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"30","key":"5196_CR13","doi-asserted-by":"publisher","first-page":"46905","DOI":"10.1007\/s11042-023-15279-z","volume":"82","author":"W Zhao","year":"2023","unstructured":"Zhao, W., Lian, Y., Chai, J., Tu, Z.: Multi-speaker chinese news broadcasting system based on improved tacotron2. Multimed. Tools Appl. 82(30), 46905\u201346937 (2023)","journal-title":"Multimed. Tools Appl."},{"key":"5196_CR14","doi-asserted-by":"crossref","unstructured":"Yao, Y., Liang, T., Feng, R., Shi, K., Yu, J., Wang, W., Li, J.: Sr-tts: A rhyme-based end-to-end speech synthesis system. Front. Neurorobot. 18, 1322312 (2024)","DOI":"10.3389\/fnbot.2024.1322312"},{"key":"5196_CR15","unstructured":"Guo, H., Liu, C., Ishi, C.T., Ishiguro, H.: Quickvc: Any-to-many voice conversion using inverse short-time fourier transform for faster conversion (2023). arXiv preprint arXiv:2302.08296"},{"key":"5196_CR16","doi-asserted-by":"crossref","unstructured":"Hu, G., Ruan, Z., Guo, W., Quan, Y.: A multi-task learning speech synthesis optimization method based on cwt: A case study of tacotron2. EURASIP J. Adv. Signal Process. 2024(1), 4 (2024)","DOI":"10.1186\/s13634-023-01096-x"},{"issue":"1","key":"5196_CR17","doi-asserted-by":"publisher","first-page":"2189217","DOI":"10.1080\/09540091.2023.2189217","volume":"35","author":"K Mao","year":"2023","unstructured":"Mao, K., Wang, Y., Ren, L., Zhang, J., Qiu, J., Dai, G.: Multi-branch feature learning based speech emotion recognition using scar-net. Connect. Sci. 35(1), 2189217 (2023)","journal-title":"Connect. Sci."},{"issue":"3","key":"5196_CR18","doi-asserted-by":"publisher","first-page":"168","DOI":"10.1007\/s11063-024-11614-z","volume":"56","author":"J Zhao","year":"2024","unstructured":"Zhao, J., Li, R., Tian, M., An, W.: Multi-view self-supervised learning and multi-scale feature fusion for automatic speech recognition. Neural Process. Lett. 56(3), 168 (2024)","journal-title":"Neural Process. Lett."},{"issue":"1","key":"5196_CR19","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1186\/s13636-024-00359-1","volume":"2024","author":"X Feng","year":"2024","unstructured":"Feng, X., Zhao, Y., Zong, W., Xu, X.: Adaptive multi-task learning for speech to text translation. EURASIP J. Audio Speech Music Process. 2024(1), 36 (2024)","journal-title":"EURASIP J. Audio Speech Music Process."},{"issue":"1","key":"5196_CR20","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1186\/s13636-023-00292-9","volume":"2023","author":"Y Zhou","year":"2023","unstructured":"Zhou, Y., Wan, H.: Dual-branch attention module-based network with parameter sharing for joint sound event detection and localization. EURASIP J. Audio Speech Music Process. 2023(1), 27 (2023)","journal-title":"EURASIP J. Audio Speech Music Process."},{"key":"5196_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.specom.2022.12.004","volume":"147","author":"AA Joshy","year":"2023","unstructured":"Joshy, A.A., Rajan, R.: Dysarthria severity classification using multi-head attention and multi-task learning. Speech Commun. 147, 1\u201311 (2023)","journal-title":"Speech Commun."},{"issue":"9","key":"5196_CR22","doi-asserted-by":"publisher","first-page":"5239","DOI":"10.3390\/app13095239","volume":"13","author":"N Yolwas","year":"2023","unstructured":"Yolwas, N., Meng, W.: Jsum: A multitask learning speech recognition model for jointly supervised and unsupervised learning. Appl. Sci. 13(9), 5239 (2023)","journal-title":"Appl. Sci."},{"key":"5196_CR23","doi-asserted-by":"crossref","unstructured":"Chen, Z., Zhang, Y., Rosenberg, A., Ramabhadran, B., Moreno, P., Wang, G.: Tts4pretrain 2.0: Advancing the use of text and speech in asr pretraining with consistency and contrastive losses. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7677\u20137681 (2022). IEEE","DOI":"10.1109\/ICASSP43922.2022.9746475"},{"issue":"11","key":"5196_CR24","first-page":"40","volume":"40","author":"M Yang","year":"2024","unstructured":"Yang, M., Jian, Z., Liang, C.: A method of synthetic spoofing speech detection using self-supervised contrastive learning. Telecommun. Sci. 40(11), 40\u201349 (2024). (Hangzhou Dianzi University)","journal-title":"Telecommun. Sci."},{"key":"5196_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Zhang, X., Sun, M., Yang, J.: A soft-contrastive pseudo learning approach towards open-world forged speech attribution. IEEE Transactions on Information Forensics and Security (2024)","DOI":"10.1109\/TIFS.2024.3515815"},{"key":"5196_CR26","doi-asserted-by":"crossref","unstructured":"Tang, H., Zhang, X., Wang, J., Cheng, N., Xiao, J.: Avqvc: One-shot voice conversion by vector quantization with applying contrastive learning. In: ICASSP 2022\u20132022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4613\u20134617 (2022). IEEE","DOI":"10.1109\/ICASSP43922.2022.9746369"},{"key":"5196_CR27","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2024.103139","volume":"165","author":"Y Xue","year":"2024","unstructured":"Xue, Y., Chen, N., Luo, Y., Zhu, H., Zhu, Z.: Clessr-vc: Contrastive learning enhanced self-supervised representations for one-shot voice conversion. Speech Commun. 165, 103139 (2024)","journal-title":"Speech Commun."},{"key":"5196_CR28","doi-asserted-by":"crossref","unstructured":"Tan, Y., Ding, X.: Split-attention cnn and self-attention with rope and gcn for voice activity detection. IEEE Access (2024)","DOI":"10.1109\/ACCESS.2024.3486003"},{"key":"5196_CR29","doi-asserted-by":"crossref","unstructured":"Kaliyev, A., Rybin, S.V., Matveev, Y.N.: Phoneme duration prediction for kazakh language. In: International Conference on Speech and Computer, pp. 274\u2013280 (2018). Springer","DOI":"10.1007\/978-3-319-99579-3_29"},{"key":"5196_CR30","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an asr corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210 (2015). IEEE","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"5196_CR31","unstructured":"Qin, Z., Zhao, W., Yu, X., Sun, X.: Openvoice: Versatile instant voice cloning (2023). arXiv preprint arXiv:2312.01479"},{"key":"5196_CR32","doi-asserted-by":"crossref","unstructured":"Kubichek, R.: Mel-cepstral distance measure for objective speech quality assessment. In: Proceedings of IEEE Pacific Rim Conference on Communications Computers and Signal Processing, vol. 1, pp. 125\u2013128 (1993). IEEE","DOI":"10.1109\/PACRIM.1993.407206"},{"issue":"1","key":"5196_CR33","first-page":"93","volume":"44","author":"W Zhu","year":"2025","unstructured":"Zhu, W., Wu, J., Jin, H., Ye, W., Zhu, Z.: Speaker recognition model based on multi-granularity spatio-temporal attention mechanism. Tech. Acoust. 44(1), 93\u2013101 (2025). (Foshan University and South China University of Technology)","journal-title":"Tech. Acoust."},{"key":"5196_CR34","doi-asserted-by":"crossref","unstructured":"Martin-Donas, J.M., Gomez, A.M., Gonzalez, J.A., Peinado, A.M.: A deep learning loss function based on the perceptual evaluation of the speech quality. IEEE Signal Process. Lett. 25(11), 1680\u20131684 (2018)","DOI":"10.1109\/LSP.2018.2871419"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05196-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-026-05196-7","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-026-05196-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T22:16:10Z","timestamp":1774217770000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-026-05196-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,24]]},"references-count":34,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["5196"],"URL":"https:\/\/doi.org\/10.1007\/s11760-026-05196-7","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,24]]},"assertion":[{"value":"30 June 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 November 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 February 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 February 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"This study involved human listeners for MOS tests only. All procedures were approved by the Ethics Review Committee of Shanghai Dianji University. Written informed consent was obtained from all participants, and no animal experiments were performed.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Human and animal rights"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"123"}}