{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T06:01:52Z","timestamp":1779948112552,"version":"3.53.1"},"reference-count":100,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T00:00:00Z","timestamp":1773964800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T00:00:00Z","timestamp":1773964800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,20]]},"DOI":"10.1109\/3dv69130.2026.00073","type":"proceedings-article","created":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T19:40:49Z","timestamp":1779910849000},"page":"704-716","source":"Crossref","is-referenced-by-count":0,"title":["Supervising 3D Talking Head Avatars with Analysis-by-Audio-Synthesis"],"prefix":"10.1109","author":[{"given":"Radek","family":"Dan\u011b\u010dek","sequence":"first","affiliation":[{"name":"Max Planck Institute for Intelligent Systems,T\u00fcbingen,Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Carolin","family":"Schmitt","sequence":"additional","affiliation":[{"name":"Max Planck Institute for Intelligent Systems,T\u00fcbingen,Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Senya","family":"Polikovsky","sequence":"additional","affiliation":[{"name":"Max Planck Institute for Intelligent Systems,T\u00fcbingen,Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Michael J.","family":"Black","sequence":"additional","affiliation":[{"name":"Max Planck Institute for Intelligent 
Systems,T\u00fcbingen,Germany"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","article-title":"LRS3-TED: a large-scale dataset for visual speech recognition","author":"Afouras","year":"2018","journal-title":"CoRR"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02009"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01214"},{"key":"ref4","first-page":"33:12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/AFGR.2002.1004155"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1111\/1467-8659.t01-1-00712"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/1095878.1095881"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01962"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00190"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00718"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-194"},{"key":"ref13","first-page":"200","article-title":"Animated speech: research progress and applications","volume-title":"Auditory-Visual Speech Processing, AVSP 2001","author":"Cohen"},{"key":"ref14","author":"Cooke","year":"2006","journal-title":"The grid audio-visual speech corpus"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-139"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01967"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"ref19","volume-title":"INFERNO: Set the world on 
fire with FLAME","author":"Dan\u011b\u010dek","year":"2023"},{"key":"ref20","first-page":"1836","article-title":"SVTS: scalable video-to-speech synthesis","volume-title":"Annual Conference of the International Speech Communication Association, Interspeech 2022","author":"de Mira","year":"2022"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925984"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3388767.3407339"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2024-1595"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72940-9_12"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2010.2052239"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459936"},{"key":"ref29","author":"Panagiotis","year":"2022","journal-title":"Visual speech-aware perceptual 3d facial expression reconstruction from videos"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i2.27945"},{"key":"ref31","author":"Giebenhain","year":"2025","journal-title":"Pixel3dmm: Versatile screen-space priors for single-image 3d face reconstruction"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3614157"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2407694"},{"key":"ref35","article-title":"Classifier-free diffusion guidance","author":"Ho","year":"2022","journal-title":"CoRR"},{"key":"ref36","article-title":"Denoising diffusion probabilistic models","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 
2020","author":"Ho","year":"2020"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3126925"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref39","volume-title":"Real-time voice cloning","author":"Jemine","year":"2023"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2585878"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"ref42","article-title":"Parallel-data-free voice conversion using cycle-consistent adversarial networks","author":"Kaneko","year":"2017","journal-title":"CoRR"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2280"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01046"},{"key":"ref47","first-page":"2758","article-title":"Lip to speech synthesis with visual context attentional GAN","volume-title":"Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems 2021, NeurIPS 2021","author":"Kim","year":"2021"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095582"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"},{"key":"ref50","author":"Ladefoged","year":"2011","journal-title":"A course in phonetics (sixth edition)"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130813"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1016\/j.patter.2022.100616"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-025-1562-4"},{"key":"ref55","article-title":"Mediapipe: A framework for building perception 
pipelines","author":"Lugaresi","year":"2019","journal-title":"CoRR"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1026"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73397-0_2"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00283"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO54536.2021.9616266"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611734"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.287"},{"key":"ref63","article-title":"End-to-end learning for 3d facial animation from raw waveforms of speech","author":"Xuan Pham","year":"2017","journal-title":"CoRR"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"ref65","article-title":"Train short, test long: Attention with linear biases enables input length extrapolation","volume-title":"The Tenth International Conference on Learning Representations, ICLR 2022, Virtual Event","author":"Press","year":"2022"},{"key":"ref66","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","author":"Radford","year":"2021"},{"key":"ref67","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International Conference on Machine Learning, ICML 2023","author":"Radford","year":"2023"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00241"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"ref70","first-page":"749","article-title":"Perceptual evaluation of speech quality (pesq)-a new method for speech quality assessment of telephone networks and codecs","volume-title":"IEEE 
International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2001","author":"Antony","year":"2001"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00795"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3623264.3624447"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/3658221"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1794"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2011.2114881"},{"key":"ref77","first-page":"275","article-title":"Dynamic units of visual speech","volume-title":"Proceedings of the 2012 Eurographics\/ACM SIGGRAPH Symposium on Computer Animation, SCA 2012","author":"Taylor"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073699"},{"key":"ref79","article-title":"Human motion diffusion model","volume-title":"The Eleventh International Conference on Learning Representations","author":"Tevet","year":"2023"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.401"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00270"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01107"},{"key":"ref83","author":"Thambiraja","year":"2023","journal-title":"3diface: Diffusion-based speech-driven 3d facial animation and editing"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO58844.2023.10290115"},{"key":"ref86","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in neural information processing 
systems"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0054761"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1445"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"ref90","author":"Wuu","year":"2022","journal-title":"Multiface: A dataset for neural face rendering"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/2522628.2522904"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72992-8_14"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414040"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02577"},{"key":"ref96","author":"Yemini","year":"2024","journal-title":"Lipvoicer: Generating speech from silent videos guided by lip reading"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00829"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657413"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201292"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_15"}],"event":{"name":"2026 International Conference on 3D Vision (3DV)","location":"Vancouver, BC, Canada","start":{"date-parts":[[2026,3,20]]},"end":{"date-parts":[[2026,3,23]]}},"container-title":["2026 International Conference on 3D Vision 
(3DV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11533157\/11533158\/11533305.pdf?arnumber=11533305","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T05:02:46Z","timestamp":1779944566000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11533305\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,20]]},"references-count":100,"URL":"https:\/\/doi.org\/10.1109\/3dv69130.2026.00073","relation":{},"subject":[],"published":{"date-parts":[[2026,3,20]]}}}