{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T05:24:07Z","timestamp":1755926647462,"version":"3.37.3"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,13]],"date-time":"2021-12-13T00:00:00Z","timestamp":1639353600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100004807","name":"DFG","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100004807","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12,13]]},"DOI":"10.1109\/asru51503.2021.9687866","type":"proceedings-article","created":{"date-parts":[[2022,2,3]],"date-time":"2022-02-03T20:31:00Z","timestamp":1643920260000},"page":"757-764","source":"Crossref","is-referenced-by-count":2,"title":["Hearing Faces: Target Speaker Text-to-Speech Synthesis from a Face"],"prefix":"10.1109","author":[{"given":"Bjorn","family":"Pluster","sequence":"first","affiliation":[{"name":"Knowledge Technology, University of Hamburg,Department of Informatics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cornelius","family":"Weber","sequence":"additional","affiliation":[{"name":"Knowledge Technology, University of Hamburg,Department of Informatics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Leyuan","family":"Qu","sequence":"additional","affiliation":[{"name":"Knowledge Technology, University of Hamburg,Department of Informatics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Stefan","family":"Wermter","sequence":"additional","affiliation":[{"name":"Knowledge Technology, University of Hamburg,Department of Informatics"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","first-page":"577","article-title":"Attention-based models for speech recognition","volume":"2015 janua","author":"chorowski","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref38","first-page":"5999","article-title":"Attention is all you need","volume":"2017 decem","author":"vaswani","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref33","article-title":"WaveNet: A Generative Model for Raw Audio","author":"van den oord","year":"2016","journal-title":"ArXiv Preprint"},{"journal-title":"CSTR VCTK Corpus English Multi-Speaker Corpus for CSTR Voice Cloning Toolkit (version 0 92)","year":"2019","author":"yamagishi","key":"ref32"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2015EDP7457"},{"key":"ref30","doi-asserted-by":"crossref","first-page":"1039","DOI":"10.1016\/j.specom.2009.04.004","article-title":"Statistical Parametric Speech Synthesis","volume":"51","author":"zen","year":"2009","journal-title":"Speech Communication"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref36","first-page":"448","article-title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift","volume":"1","author":"ioffe","year":"0","journal-title":"32nd International Conference on Machine Learning"},{"journal-title":"G2pe","year":"2019","author":"kyubyong","key":"ref35"},{"journal-title":"Parallel WaveGAN implementation with Pytorch","year":"2020","author":"hayashi","key":"ref34"},{"key":"ref28","first-page":"1416","article-title":"Multimodal Target Speech Separation with Voice and Face References","volume":"2020 octob","author":"weber","year":"0","journal-title":"Proceedings of the Annual Conference of the International Speech Communication Association"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201357"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2136"},{"key":"ref2","first-page":"2963","article-title":"Deep Voice 2: Multi-speaker neural text-to-speech","volume":"2017 decem","author":"arik","year":"0","journal-title":"Advances in neural information processing systems"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"ref20","article-title":"MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis","volume":"32","author":"kumar","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref22","article-title":"LRS3-TED: a large-scale dataset for visual speech recognition","author":"afouras","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00772"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682970"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1400"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683872"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20873-8_18"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"journal-title":"FaceNet-Pytorch","year":"2020","author":"esler","key":"ref13"},{"key":"ref14","first-page":"4278","article-title":"Inception-v4, Inception-ResN et and the Impact of Residual Connections on Learning","author":"szegedy","year":"0","journal-title":"31st AAAI Conference on Artificial Intelligence"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00323"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ASYU50717.2020.9259802"},{"journal-title":"ESPNet Model Zoo","year":"2020","author":"hayashi","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016706"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref4","first-page":"8229","article-title":"Style tokens: Unsupervised style modeling, control and transfer in end-to-end speech synthesis","volume":"12","author":"wang","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref3","first-page":"1","article-title":"FastSpeech 2: Fast and High-Quality End-to-End Text to Speech","author":"ren","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1207\/s15326969eco1603_1"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2003.09.005"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3758\/s13414-015-1045-8"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1037\/a0030945"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00879"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1617"},{"key":"ref45","first-page":"9633","article-title":"Listening to Sounds of Silence for Speech Denoising","volume":"33","author":"xu","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1130"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1955"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00020"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1924"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21105\/joss.00861"}],"event":{"name":"2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","start":{"date-parts":[[2021,12,13]]},"location":"Cartagena, Colombia","end":{"date-parts":[[2021,12,17]]}},"container-title":["2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9687821\/9687855\/09687866.pdf?arnumber=9687866","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,16]],"date-time":"2022-05-16T20:42:05Z","timestamp":1652733725000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9687866\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,13]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/asru51503.2021.9687866","relation":{},"subject":[],"published":{"date-parts":[[2021,12,13]]}}}