{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:13:30Z","timestamp":1740100410402,"version":"3.37.3"},"reference-count":24,"publisher":"IEEE","funder":[{"DOI":"10.13039\/501100006730","name":"Romanian Ministry of Education and Research","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006730","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100005186","name":"CNCS - UEFISCDI","doi-asserted-by":"publisher","award":["PN-III-P1-1.1-PD-2019-0918"],"award-info":[{"award-number":["PN-III-P1-1.1-PD-2019-0918"]}],"id":[{"id":"10.13039\/100005186","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,8,23]]},"DOI":"10.23919\/eusipco54536.2021.9616266","type":"proceedings-article","created":{"date-parts":[[2021,12,8]],"date-time":"2021-12-08T21:55:53Z","timestamp":1639000553000},"page":"46-50","source":"Crossref","is-referenced-by-count":6,"title":["Speaker disentanglement in video-to-speech conversion"],"prefix":"10.23919","author":[{"given":"Dan","family":"Oneata","sequence":"first","affiliation":[]},{"given":"Adriana","family":"Stan","sequence":"additional","affiliation":[]},{"given":"Horia","family":"Cucu","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2015.03.005"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-139"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.330110023"},{"key":"ref13","first-page":"13796","article-title":"Learning individual speaking styles for accurate lip to speech synthesis","author":"kr","year":"0","journal-title":"IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1943"},{"key":"ref15","first-page":"4779","article-title":"Natural TTS synthesis by conditioning WaveNet on Mel spectrogram predictions","author":"shen","year":"0","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref17","article-title":"Efficiently trainable text-to-speech system based on deep convolutional networks with guided attention","volume":"abs 1710 8969","author":"tachibana","year":"2017","journal-title":"CoRR"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1983.1172092"},{"key":"ref19","first-page":"5791","article-title":"Utterance-level aggregation for speaker recognition in the wild","author":"xie","year":"0","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref4","article-title":"Exemplar-based lip-to-speech synthesis using convolutional neural networks","author":"takashima","year":"0","journal-title":"Workshop on Frontiers of Computer Vision"},{"key":"ref3","first-page":"2516","article-title":"Lip2AudSpec: Speech reconstruction from silent lip movements video","author":"akbari","year":"0","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref6","first-page":"2962","article-title":"Deep voice 2: Multi-speaker neural text-to-speech","author":"gibiansky","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1445"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00222"},{"key":"ref7","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"2018","journal-title":"Advances in neural information processing systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3241911"},{"key":"ref9","first-page":"5901","article-title":"Disentangling correlated speaker and noise for speech synthesis via data augmentation and adversarial factorization","author":"hsu","year":"0","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref1","first-page":"5095","article-title":"Vid2speech: Speech reconstruction from silent video","author":"ephrat","year":"0","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref20","first-page":"1180","article-title":"Unsupervised domain adaptation by backpropagation","author":"ganin","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-647"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"ref24","article-title":"Maximizing mutual information for Tacotron","author":"liu","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref23","first-page":"3935","article-title":"Enhancing the TED-LIUM corpus with selected data for language modeling and more TED talks","author":"rousseau","year":"2014","journal-title":"LREC"}],"event":{"name":"2021 29th European Signal Processing Conference (EUSIPCO)","start":{"date-parts":[[2021,8,23]]},"location":"Dublin, Ireland","end":{"date-parts":[[2021,8,27]]}},"container-title":["2021 29th European Signal Processing Conference (EUSIPCO)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9615915\/9615917\/09616266.pdf?arnumber=9616266","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,2,15]],"date-time":"2022-02-15T06:44:13Z","timestamp":1644907453000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9616266\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,23]]},"references-count":24,"URL":"https:\/\/doi.org\/10.23919\/eusipco54536.2021.9616266","relation":{},"subject":[],"published":{"date-parts":[[2021,8,23]]}}}