{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:59:04Z","timestamp":1776931144036,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62576216"],"award-info":[{"award-number":["62576216"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3769748.3773363","type":"proceedings-article","created":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T10:33:15Z","timestamp":1765189995000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["FlowTalk: Real-Time Audio-Driven Talking Head Synthesis via Motion-Space Flow Matching"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1352-2952","authenticated-orcid":false,"given":"Kaijun","family":"Deng","sequence":"first","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7301-2284","authenticated-orcid":false,"given":"Yuhang","family":"Guo","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1420-0815","authenticated-orcid":false,"given":"Linlin","family":"Shen","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,8]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Zhiyuan Chen Jiajiong Cao Zhiquan Chen Yuming Li and Chenguang Ma. 2024. EchoMimic: Lifelike Audio-Driven Portrait Animations through Editable Landmark Conditioning. arxiv:https:\/\/arXiv.org\/abs\/2407.08136\u00a0[cs.CV]"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681627"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890278"},{"key":"e_1_3_3_1_5_2","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Di Donglin","year":"2025","unstructured":"Donglin Di, He Feng, Wenzhang Sun, Yongjia Ma, Hao Li, Wei Chen, Lei Fan, Tonghua Su, and Xun Yang. 2025. DH-FaceVid-1K: A Large-Scale High-Quality Dataset for Face Video Generation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_3_1_6_2","unstructured":"He Feng Yongjia Ma Donglin Di Lei Fan Tonghua Su and Xiangqian Wu. 2025. DiTalker: A Unified DiT-based Framework for High-Quality and Speaking Styles Controllable Portrait Animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.06511 (2025)."},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00151"},{"key":"e_1_3_3_1_8_2","unstructured":"Jianzhu Guo Dingyun Zhang Xiaoqiang Liu Zhizhou Zhong Yuan Zhang Pengfei Wan and Di Zhang. 2024. LivePortrait: Efficient Portrait Animation with Stitching and Retargeting Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.03168 (2024)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_3_1_10_2","unstructured":"Yuhang Guo Kaijun Deng Siyang Song Jindong Xie Wenhui Ma and Linlin Shen. 2025. D\\(\\hat{\\phantom{a}}\\) 3-Talker : Dual-Branch Decoupled Deformation Fields for Few-Shot 3D Talking Head Synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.14449 (2025)."},{"key":"e_1_3_3_1_11_2","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_12_2","unstructured":"Jonathan Ho Ajay Jain and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020) 6840\u20136851."},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"crossref","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio speech and language processing 29 (2021) 3451\u20133460.","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_1_14_2","unstructured":"Jianwen Jiang Chao Liang Jiaqi Yang Gaojie Lin Tianyun Zhong and Yanbo Zheng. 2024. Loopy: Taming audio-driven portrait avatar with long-term motion dependency. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.02634 (2024)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3d gaussian splatting for real-time radiance field rendering. ACM Transactions on Graphics 42 4 (2023) 139\u20131.","DOI":"10.1145\/3592433"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i4.28086"},{"key":"e_1_3_3_1_17_2","first-page":"127","volume-title":"European Conference on Computer Vision","author":"Li Jiahe","year":"2024","unstructured":"Jiahe Li, Jiawei Zhang, Xiao Bai, Jin Zheng, Xin Ning, Jun Zhou, and Lin Gu. 2024. Talkinggaussian: Structure-persistent 3d talking head synthesis via gaussian splatting. In European Conference on Computer Vision. 127\u2013145."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00999"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00696"},{"key":"e_1_3_3_1_20_2","unstructured":"Tianqi Li Ruobing Zheng Minghui Yang Jingdong Chen and Ming Yang. 2024. Ditto: Motion-Space Diffusion for Controllable Realtime Talking Head Synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.19509 (2024)."},{"key":"e_1_3_3_1_21_2","unstructured":"Yaron Lipman Ricky\u00a0TQ Chen Heli Ben-Hamu Maximilian Nickel and Matt Le. 2022. Flow matching for generative modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.02747 (2022)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Ben Mildenhall Pratul\u00a0P Srinivasan Matthew Tancik Jonathan\u00a0T Barron Ravi Ramamoorthi and Ren Ng. 2021. Nerf: Representing scenes as neural radiance fields for view synthesis. Commun. ACM 65 1 (2021) 99\u2013106.","DOI":"10.1145\/3503250"},{"key":"e_1_3_3_1_23_2","volume-title":"AVSP\u201998 International Conference on Auditory-Visual Speech Processing","author":"Morishima Shigeo","year":"1998","unstructured":"Shigeo Morishima. 1998. Real-time talking head driven by voice and its application to communication and entertainment. In AVSP\u201998 International Conference on Auditory-Visual Speech Processing."},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Siyang Song Zilong Shao Shashank Jaiswal Linlin Shen Michel Valstar and Hatice Gunes. 2022. Learning person-specific cognition from facial reactions for automatic personality recognition. IEEE Transactions on Affective Computing 14 4 (2022) 3048\u20133065.","DOI":"10.1109\/TAFFC.2022.3230672"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612832"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"crossref","unstructured":"Zhiyao Sun Tian Lv Sheng Ye Matthieu Lin Jenny Sheng Yu-Hui Wen Minjing Yu and Yong-jin Liu. 2024. Diffposetalk: Speech-driven stylistic 3d facial animation and head pose generation via diffusion models. ACM Transactions on Graphics (TOG) 43 4 (2024) 1\u20139.","DOI":"10.1145\/3658221"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Hiroki Tanaka Satoshi Nakamura et\u00a0al. 2022. The acceptability of virtual characters as social skills trainers: usability study. JMIR human factors 9 1 (2022) e35358.","DOI":"10.2196\/35358"},{"key":"e_1_3_3_1_29_2","first-page":"244","volume-title":"European Conference on Computer Vision","author":"Tian Linrui","year":"2024","unstructured":"Linrui Tian, Qi Wang, Bang Zhang, and Liefeng Bo. 2024. Emo: Emote portrait alive generating expressive portrait videos with audio2video diffusion model under weak conditions. In European Conference on Computer Vision. Springer, 244\u2013260."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Alex Trevithick Matthew Chan Michael Stengel Eric Chan Chao Liu Zhiding Yu Sameh Khamis Manmohan Chandraker Ravi Ramamoorthi and Koki Nagano. 2023. Real-time radiance fields for single-image portrait view synthesis. ACM Transactions on Graphics (TOG) 42 4 (2023) 1\u201315.","DOI":"10.1145\/3592460"},{"key":"e_1_3_3_1_31_2","unstructured":"Thomas Unterthiner Sjoerd van Steenkiste Karol Kurach Rapha\u00ebl Marinier Marcin Michalski and Sylvain Gelly. 2019. FVD: A new Metric for Video Generation. https:\/\/openreview.net\/forum?id=rylgEULtdN"},{"key":"e_1_3_3_1_32_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3746027.3755285"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00855"},{"key":"e_1_3_3_1_35_2","unstructured":"Mingwang Xu Hui Li Qingkun Su Hanlin Shang Liwei Zhang Ce Liu Jingdong Wang Yao Yao and Siyu Zhu. 2024. Hallo: Hierarchical audio-driven visual synthesis for portrait image animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.08801 (2024)."},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"crossref","unstructured":"Sicheng Xu Guojun Chen Yu-Xiao Guo Jiaolong Yang Chong Li Zhenyu Zang Yizhong Zhang Xin Tong and Baining Guo. 2024. Vasa-1: Lifelike audio-driven talking faces generated in real time. Advances in Neural Information Processing Systems 37 (2024) 660\u2013684.","DOI":"10.52202\/079017-0021"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_6"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25464"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"crossref","unstructured":"Yang Zhou Xintong Han Eli Shechtman Jose Echevarria Evangelos Kalogerakis and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG) 39 6 (2020) 1\u201315.","DOI":"10.1145\/3414685.3417774"}],"event":{"name":"MMAsia '25 Workshops: ACM Multimedia Asia Workshops","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25 Workshops","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3769748.3773363","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T22:04:09Z","timestamp":1769205849000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3769748.3773363"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,8]]},"references-count":40,"alternative-id":["10.1145\/3769748.3773363","10.1145\/3769748"],"URL":"https:\/\/doi.org\/10.1145\/3769748.3773363","relation":{},"subject":[],"published":{"date-parts":[[2025,12,8]]},"assertion":[{"value":"2025-12-08","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}