{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T17:30:32Z","timestamp":1769535032185,"version":"3.49.0"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/icme59968.2025.11209157","type":"proceedings-article","created":{"date-parts":[[2025,10,30]],"date-time":"2025-10-30T17:57:42Z","timestamp":1761847062000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["A Progressive Generation Framework with Speech Pre-trained Model for Expressive Voice Conversion"],"prefix":"10.1109","author":[{"given":"Tianrui","family":"Wang","sequence":"first","affiliation":[{"name":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Meng","family":"Ge","sequence":"additional","affiliation":[{"name":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhikang","family":"Niu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Gong","sequence":"additional","affiliation":[{"name":"China Telecom,Institute of Artificial Intelligence,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chunyu","family":"Qiang","sequence":"additional","affiliation":[{"name":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zikang","family":"Huang","sequence":"additional","affiliation":[{"name":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziyang","family":"Ma","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaobao","family":"Wang","sequence":"additional","affiliation":[{"name":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xie","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University,MoE Key Lab of Artificial Intelligence, AI Institute,Shanghai,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Longbiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Tianjin University,Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing,Tianjin,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianwu","family":"Dang","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Shenzhen Institute of Advanced Technology,Guangdong,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3434425"},{"key":"ref3","first-page":"7836","article-title":"Unsupervised speech decomposition via triple information bottleneck","volume":"119","author":"Qian","journal-title":"Proceedings of Machine Learning Research"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9687906"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10249"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389651"},{"key":"ref7","first-page":"8650","article-title":"Global prosody style transfer without text transcriptions","author":"Qian","year":"2021","journal-title":"ICML"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2023.3250266"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00916"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-283"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447984"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2663"},{"key":"ref14","article-title":"Naturalspeech 3: Zero-shot speech synthesis with factorized codec and diffusion models","volume-title":"Forty-first International Conference on Machine Learning","author":"Ju"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613800"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/odyssey.2024-25"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/taslpro.2025.3564118"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i4.25594"},{"key":"ref19","first-page":"5210","article-title":"AUTOVC: Zero-shot voice style transfer with only autoencoder loss","author":"Qian","year":"2019","journal-title":"ICML"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054734"},{"key":"ref21","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"NeurIPS","volume":"33","author":"Baevski"},{"key":"ref22","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2022"},{"key":"ref23","article-title":"Iqdubbing: Prosody modeling based on discrete self-supervised speech representation for expressive voice conversion","author":"Gan","year":"2022"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096057"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3461615.3491106"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/SpeechProsody.2004-1"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1038\/nrn2113"},{"key":"ref28","article-title":"BigVGAN: A universal neural vocoder with large-scale training","author":"Lee","year":"2022"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.580"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1446"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-1566"},{"key":"ref33","article-title":"CosyVoice: A scalable multilingual zero-shot text-to-speech synthesizer based on supervised semantic tokens","author":"Du","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-788"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref37","first-page":"1298","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proc. ICML","author":"Baevski"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.931"}],"event":{"name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","location":"Nantes, France","start":{"date-parts":[[2025,6,30]]},"end":{"date-parts":[[2025,7,4]]}},"container-title":["2025 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11208895\/11208897\/11209157.pdf?arnumber=11209157","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,27]],"date-time":"2026-01-27T04:46:24Z","timestamp":1769489184000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11209157\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/icme59968.2025.11209157","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}