{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T17:57:10Z","timestamp":1776275830315,"version":"3.50.1"},"reference-count":77,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"National Science and Technology Major Project of China","award":["2021ZD0114303"],"award-info":[{"award-number":["2021ZD0114303"]}]},{"name":"Youth Foundation Project of Zhejiang Lab","award":["111011-AA2301"],"award-info":[{"award-number":["111011-AA2301"]}]},{"name":"DFG","award":["CML 261402652"],"award-info":[{"award-number":["CML 261402652"]}]},{"name":"DFG","award":["433323019"],"award-info":[{"award-number":["433323019"]}]},{"name":"DFG","award":["402776968"],"award-info":[{"award-number":["402776968"]}]},{"name":"TRAIL MSCA Doctoral Network"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2023.3320864","type":"journal-article","created":{"date-parts":[[2023,10,2]],"date-time":"2023-10-02T18:02:57Z","timestamp":1696269777000},"page":"39-54","source":"Crossref","is-referenced-by-count":12,"title":["Disentangling Prosody Representations With Unsupervised Speech Reconstruction"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6694-5355","authenticated-orcid":false,"given":"Leyuan","family":"Qu","sequence":"first","affiliation":[{"name":"Institute of Artificial Intelligence, Zhejiang Lab, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3279-7125","authenticated-orcid":false,"given":"Taihao","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Zhejiang Lab, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5163-938X","authenticated-orcid":false,"given":"Cornelius","family":"Weber","sequence":"additional","affiliation":[{"name":"Department of Informatics, University of Hamburg, Hamburg, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2847-5097","authenticated-orcid":false,"given":"Theresa","family":"Pekarek-Rosin","sequence":"additional","affiliation":[{"name":"Department of Informatics, University of Hamburg, Hamburg, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4860-9184","authenticated-orcid":false,"given":"Fuji","family":"Ren","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1343-4775","authenticated-orcid":false,"given":"Stefan","family":"Wermter","sequence":"additional","affiliation":[{"name":"Department of Informatics, University of Hamburg, Hamburg, Germany"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2017.01.001"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-demo.1"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414880"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.769"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1080\/02699931.2010.516915"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2021.11.006"},{"key":"ref11","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref55","first-page":"1","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref54","first-page":"1171","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","author":"bengio","year":"0","journal-title":"Proc 28th Int Conf Neural Inf Process Syst"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1111\/1467-8721.00013"},{"key":"ref16","first-page":"16251","article-title":"Neural analysis and synthesis: Reconstructing speech from self-supervised representations","author":"choi","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859946"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683623"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2018.8489099"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2022.3191677"},{"key":"ref45","first-page":"577","article-title":"Attention-based models for speech recognition","author":"chorowski","year":"0","journal-title":"Proc 28th Int Conf Neural Inf Process Syst"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2736999"},{"key":"ref47","article-title":"LRS3-TED: A large-scale dataset for visual speech recognition","author":"afouras","year":"2018"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2938758"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-647"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2016.2515617"},{"key":"ref8","first-page":"7836","article-title":"Unsupervised speech decomposition via triple information bottleneck","author":"qian","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref7","first-page":"5180","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","author":"wang","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP49672.2021.9362098"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1242"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1649"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-984"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICIS.2016.7550889"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-781"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054579"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3190715"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3175578"},{"key":"ref31","article-title":"A fine-tuned Wav2Vec 2.0\/HuBERT benchmark for speech emotion recognition, speaker verification and spoken language understanding","author":"wang","year":"2021"},{"key":"ref75","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","author":"radford","year":"0","journal-title":"Proc 40th Int Conf Mach Learn"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746990"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-33"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1515\/cogl.2010.023"},{"key":"ref32","first-page":"1537","article-title":"Transformation of prosody in voice conversion","author":"?i?man","year":"0","journal-title":"Proc IEEE Asia-Pacific Signal Inf Process Assoc Annu Summit Conf"},{"key":"ref76","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"maaten","year":"2008","journal-title":"J Mach Learn Res"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-devpsych-060320-102556"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1037\/10001-000"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2650"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053192"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414540"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746679"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3120586"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.312"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9979844"},{"key":"ref23","first-page":"4693","article-title":"Towards end-to-end prosody transfer for expressive speech synthesis with tacotron","author":"skerry-ryan","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref67","first-page":"6332","article-title":"Speechsplit2.0: Unsupervised speech disentanglement for voice conversion without tuning autoencoder bottlenecks","author":"chan","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref26","first-page":"423","article-title":"Reusing neural speech representations for auditory emotion recognition","author":"lakomkin","year":"0","journal-title":"Proc Int Joint Conf Natural Lang Process"},{"key":"ref25","first-page":"11134","article-title":"Learning de-identified representations of prosody from raw audio","author":"weston","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3235194"},{"key":"ref20","first-page":"5210","article-title":"AutoVC: Zero-shot voice style transfer with only autoencoder loss","author":"qian","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054548"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref22","first-page":"368","article-title":"The information bottleneck method","author":"tishby","year":"0","journal-title":"Proc Annu Allerton Conf Commun Control Comput"},{"key":"ref66","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","author":"baevski","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref21","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"baevski","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref65","article-title":"DeCoAR 2.0: Deep contextualized acoustic representations with vector quantization","author":"ling","year":"2020"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-94"},{"key":"ref27","article-title":"Progressive neural networks","author":"rusu","year":"2016"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3189481"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747095"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1016\/j.vrih.2020.12.002"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2822"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10269014.pdf?arnumber=10269014","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T19:33:08Z","timestamp":1700508788000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10269014\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":77,"URL":"https:\/\/doi.org\/10.1109\/taslp.2023.3320864","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}