{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:30:53Z","timestamp":1777865453385,"version":"3.51.4"},"reference-count":106,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01324","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"14270-14282","source":"Crossref","is-referenced-by-count":0,"title":["AV-Flow: Transforming Text to Audio-Visual Human-Like Interactions"],"prefix":"10.1109","author":[{"given":"Aggelina","family":"Chatziagapi","sequence":"first","affiliation":[{"name":"Stony Brook University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Louis-Philippe","family":"Morency","sequence":"additional","affiliation":[{"name":"Meta AI"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongyu","family":"Gong","sequence":"additional","affiliation":[{"name":"Meta AI"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"Zollh\u00f6fer","sequence":"additional","affiliation":[{"name":"Codec Avatars Lab, Meta"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dimitris","family":"Samaras","sequence":"additional","affiliation":[{"name":"Stony Brook University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexander","family":"Richard","sequence":"additional","affiliation":[{"name":"Codec Avatars Lab, Meta"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gaussianspeech: Audio-driven gaussian avatars","author":"Aneja","year":"2024","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02009"},{"key":"ref3","first-page":"12449","article-title":"wav2vec 2.0: A framework for selfsupervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459850"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311556"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311537"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/258734.258880"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/1095878.1095881"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/FG57933.2023.10042567"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58545-7_3"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32241"},{"key":"ref13","article-title":"Gaus-siantalker: Real-time high-fidelity talking head synthesis with audio-driven 3d gaussian splatting","author":"Cho","year":"2024","journal-title":"arXiv preprint"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448506"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00472"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00812"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925984"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1080\/09298210701653344"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1023\/A:1008166717597"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178899"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323028"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2005.843341"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.02069"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"ref28","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref29","volume-title":"The lj speech dataset","author":"Ito","year":"2017"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00842"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530745"},{"key":"ref33","article-title":"Loopy: Taming audiodriven portrait avatar with long-term motion dependency","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Jiang","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.5573\/ieie.2024.61.11.92"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2783258.2783356"},{"key":"ref37","article-title":"Obamanet: Photo-realistic lip-sync from text","author":"Kumar","year":"2017","journal-title":"arXiv preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00278"},{"key":"ref39","article-title":"Bigvgan: A universal neural vocoder with large-scale training","author":"Lee","year":"2022","journal-title":"arXiv preprint"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72684-2_8"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16286"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130813"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00338"},{"key":"ref45","article-title":"Posetalk: Text-and-audio-based pose control and motion refinement for one-shot talking head generation","author":"Ling","year":"2024","journal-title":"arXiv preprint"},{"key":"ref46","article-title":"Flow matching for generative modeling","author":"Lipman","year":"2022","journal-title":"arXiv preprint"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20066"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201401"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2025.3581808"},{"key":"ref51","article-title":"Dreamtalk: When expressive talking head generation meets diffusion probabilistic models","author":"Ma","year":"2023","journal-title":"arXiv preprint"},{"key":"ref52","first-page":"18","article-title":"librosa: Audio and music signal analysis in python","author":"Brian","year":"2015","journal-title":"SciPy"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448291"},{"key":"ref54","article-title":"Neural text to articulate talk: Deep text to audiovisual speech synthesis achieving both auditory and photo-realism","author":"Milis","year":"2023","journal-title":"arXiv preprint"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1067"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00101"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref58","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems, 32"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.287"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00241"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/wacv48630.2021.00009"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-153"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2000-469"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02074"},{"key":"ref72","article-title":"Denoising diffusion implicit models","author":"Song","year":"2020","journal-title":"arXiv preprint"},{"key":"ref73","article-title":"Scorebased generative modeling through stochastic differential equations","author":"Song","year":"2020","journal-title":"arXiv preprint"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1145\/3658221"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28313"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073699"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.2312\/SCA\/SCA12\/275-284"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01885"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73010-8_15"},{"key":"ref84","article-title":"End-to-end speech-driven realistic facial animation with temporal gans","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops","author":"Vougioukas","year":"2019"},{"key":"ref85","first-page":"2667","article-title":"Photo-realistic expressive text to talking head synthesis","author":"Wan","year":"2013","journal-title":"INTERSPEECH"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01724"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32877"},{"key":"ref88","author":"Wei","year":"2024","journal-title":"Aniportrait: Audio-driven synthesis of photorealistic portrait animations"},{"key":"ref89","article-title":"Multiface: A dataset for neural face rendering","author":"Wuu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00639"},{"key":"ref92","author":"Xu","year":"2024","journal-title":"Hallo: Hierarchical audio-driven visual synthesis for portrait image animation"},{"key":"ref93","article-title":"Vasa-1: Lifelike audio-driven talking faces generated in real time","author":"Xu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/icassp43922.2022.9747236"},{"key":"ref95","article-title":"DFA- NeRF: Personalized Talking Head Generation via Disentangled Face Attributes Neural Rendering","author":"Yao","year":"2022","journal-title":"arXiv"},{"key":"ref96","article-title":"Geneface++: Generalized and stable realtime audio-driven 3d talking face generation","author":"Ye","year":"2023","journal-title":"arXiv preprint"},{"key":"ref97","article-title":"Geneface: Generalized and high-fidelity audio-driven 3d talking face synthesis","author":"Ye","year":"2023","journal-title":"arXiv preprint"},{"key":"ref98","article-title":"Ada-tta: Towards adaptive high-quality text-to-talking avatar synthesis","author":"Ye","year":"2023","journal-title":"arXiv preprint"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747380"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1145\/3197517.3201292"},{"issue":"6","key":"ref105","doi-asserted-by":"crossref","DOI":"10.1145\/3414685.3417774","article-title":"Makeittalk: Speaker-aware talking-head animation","volume":"39","author":"Zhou","year":"2020","journal-title":"ACM Transactions on Graphics"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01016"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446262.pdf?arnumber=11446262","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:34:47Z","timestamp":1777530887000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446262\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":106,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01324","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}