{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:59:04Z","timestamp":1776931144087,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3769748.3773362","type":"proceedings-article","created":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T10:33:15Z","timestamp":1765189995000},"page":"1-3","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AKITalk: Audio-Implicit Keypoints for Identity-Preserving Talking-Head Video Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1126-0498","authenticated-orcid":false,"given":"Riku","family":"Takahashi","sequence":"first","affiliation":[{"name":"Hosei University, Tokyo, Japan and EQUES Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0569-3619","authenticated-orcid":false,"given":"Rongzhi","family":"Li","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan and EQUES Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6016-3866","authenticated-orcid":false,"given":"Yuta","family":"Oshima","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan and EQUES Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2911-8730","authenticated-orcid":false,"given":"Sho","family":"Kuno","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan and EQUES Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6324-9291","authenticated-orcid":false,"given":"Ryugo","family":"Morita","sequence":"additional","affiliation":[{"name":"EQUES Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8516-9707","authenticated-orcid":false,"given":"Issey","family":"Sukeda","sequence":"additional","affiliation":[{"name":"EQUES Inc., Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,8]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32241"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555399"},{"key":"e_1_3_3_2_4_2","first-page":"251","volume-title":"Asian conference on computer vision","author":"Chung Joon\u00a0Son","year":"2016","unstructured":"Joon\u00a0Son Chung and Andrew Zisserman. 2016. Out of time: automated lip sync in the wild. In Asian conference on computer vision. Springer, 251\u2013263."},{"key":"e_1_3_3_2_5_2","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Di Donglin","year":"2025","unstructured":"Donglin Di, He Feng, Wenzhang Sun, Yongjia Ma, Hao Li, Wei Chen, Lei Fan, Tonghua Su, and Xun Yang. 2025. DH-FaceVid-1K: A Large-Scale High-Quality Dataset for Face Video Generation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_3_2_6_2","unstructured":"Jianzhu Guo Dingyun Zhang Xiaoqiang Liu Zhizhou Zhong Yuan Zhang Pengfei Wan and Di Zhang. 2024. Liveportrait: Efficient portrait animation with stitching and retargeting control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2407.03168 (2024)."},{"key":"e_1_3_3_2_7_2","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_3_2_10_2","first-page":"5210","volume-title":"International Conference on Machine Learning","author":"Qian Kaizhi","year":"2019","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, Xuesong Yang, and Mark Hasegawa-Johnson. 2019. Autovc: Zero-shot voice style transfer with only autoencoder loss. In International Conference on Machine Learning. PMLR, 5210\u20135219."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_12_2","unstructured":"Bowen Shi Wei-Ning Hsu Kushal Lakhotia and Abdelrahman Mohamed. 2022. Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2201.02184 (2022)."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Supasorn Suwajanakorn Steven\u00a0M Seitz and Ira Kemelmacher-Shlizerman. 2017. Synthesizing obama: learning lip sync from audio. ACM Transactions on Graphics (ToG) 36 4 (2017) 1\u201313.","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_3_2_14_2","first-page":"398","volume-title":"European Conference on Computer Vision","author":"Tan Shuai","year":"2024","unstructured":"Shuai Tan, Bin Ji, Mengxiao Bi, and Ye Pan. 2024. Edtalk: Efficient disentanglement for emotional talking head synthesis. In European Conference on Computer Vision. Springer, 398\u2013416."},{"key":"e_1_3_3_2_15_2","unstructured":"Thomas Unterthiner Sjoerd Van\u00a0Steenkiste Karol Kurach Rapha\u00ebl Marinier Marcin Michalski and Sylvain Gelly. 2019. FVD: A new metric for video generation. (2019)."},{"key":"e_1_3_3_2_16_2","unstructured":"Mingwang Xu Hui Li Qingkun Su Hanlin Shang Liwei Zhang Ce Liu Jingdong Wang Yao Yao and Siyu Zhu. 2024. Hallo: Hierarchical audio-driven visual synthesis for portrait image animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.08801 (2024)."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00607"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Yang Zhou Xintong Han Eli Shechtman Jose Echevarria Evangelos Kalogerakis and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG) 39 6 (2020) 1\u201315.","DOI":"10.1145\/3414685.3417774"}],"event":{"name":"MMAsia '25 Workshops: ACM Multimedia Asia Workshops","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25 Workshops","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3769748.3773362","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T22:04:13Z","timestamp":1769205853000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3769748.3773362"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,8]]},"references-count":19,"alternative-id":["10.1145\/3769748.3773362","10.1145\/3769748"],"URL":"https:\/\/doi.org\/10.1145\/3769748.3773362","relation":{},"subject":[],"published":{"date-parts":[[2025,12,8]]},"assertion":[{"value":"2025-12-08","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}