{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:10:54Z","timestamp":1772907054511,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681627","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"10985-10994","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":39,"title":["GaussianTalker: Real-Time Talking Head Synthesis with 3D Gaussian Splatting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0177-0694","authenticated-orcid":false,"given":"Kyusun","family":"Cho","sequence":"first","affiliation":[{"name":"Korea University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1707-9727","authenticated-orcid":false,"given":"Joungbin","family":"Lee","sequence":"additional","affiliation":[{"name":"Korea University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9238-321X","authenticated-orcid":false,"given":"Heeji","family":"Yoon","sequence":"additional","affiliation":[{"name":"Korea University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2060-2207","authenticated-orcid":false,"given":"Yeobin","family":"Hong","sequence":"additional","affiliation":[{"name":"Korea University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5159-6237","authenticated-orcid":false,"given":"Jaehoon","family":"Ko","sequence":"additional","affiliation":[{"name":"Korea University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8074-4212","authenticated-orcid":false,"given":"Sangjun","family":"Ahn","sequence":"additional","affiliation":[{"name":"NC Research, Seongnam-si, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2927-6273","authenticated-orcid":false,"given":"Seungryong","family":"Kim","sequence":"additional","affiliation":[{"name":"Korea Advanced Institute of Science and Technology, Seoul, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.116"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00021"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01565"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"e_1_3_2_2_5_1","volume-title":"Monogaussianavatar: Monocular gaussian point-based head avatar. arXiv preprint arXiv:2312.04558","author":"Chen Yufan","year":"2023","unstructured":"Yufan Chen, LizhenWang, Qijing Li, Hongjiang Xiao, Shengping Zhang, Hongxun Yao, and Yebin Liu. 2023. Monogaussianavatar: Monocular gaussian point-based head avatar. arXiv preprint arXiv:2312.04558 (2023)."},{"key":"e_1_3_2_2_6_1","volume-title":"Headgas: Real-time animatable head avatars via 3d gaussian splatting. arXiv preprint arXiv:2312.02902","author":"Dhamo Helisa","year":"2023","unstructured":"Helisa Dhamo, Yinyu Nie, Arthur Moreau, Jifei Song, Richard Shaw, Yiren Zhou, and Eduardo P\u00e9rez-Pellitero. 2023. Headgas: Real-time animatable head avatars via 3d gaussian splatting. arXiv preprint arXiv:2312.02902 (2023)."},{"key":"e_1_3_2_2_7_1","volume-title":"Friesen","author":"Ekman Paul","year":"1978","unstructured":"Paul Ekman and Wallace V. Friesen. 1978. Facial Action Coding System: Manual. Palo Alto: Consulting Psychologists Press."},{"key":"e_1_3_2_2_8_1","volume-title":"K-Planes: Explicit Radiance Fields in Space, Time, and Appearance. arXiv preprint arXiv:2301.10241","author":"Fridovich-Keil Sara","year":"2023","unstructured":"Sara Fridovich-Keil, Giacomo Meanti, Frederik Warburg, Benjamin Recht, and Angjoo Kanazawa. 2023. K-Planes: Explicit Radiance Fields in Space, Time, and Appearance. arXiv preprint arXiv:2301.10241 (2023)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_2_2_10_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. In NeurIPS."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01930"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Yuge Huang Yuhan Wang Ying Tai Xiaoming Liu Pengcheng Shen Shaoxin Li Jilin Li and Feiyue Huang. 2020. CurricularFace: Adaptive Curriculum Learning Loss for Deep Face Recognition. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00594"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_2_15_1","volume-title":"Efficient Region- Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis. arXiv preprint arXiv:2307.09323","author":"Li Jiahe","year":"2023","unstructured":"Jiahe Li, Jiawei Zhang, Xiao Bai, Jun Zhou, and Lin Gu. 2023. Efficient Region- Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis. arXiv preprint arXiv:2307.09323 (2023)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130813"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01864"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"e_1_3_2_2_19_1","volume-title":"Animatable 3D Gaussian: Fast and High-Quality Reconstruction of Multiple Human Avatars. arXiv preprint arXiv:2311.16482","author":"Liu Yang","year":"2023","unstructured":"Yang Liu, Xiang Huang, Minghan Qin, Qinwei Lin, and Haoqian Wang. 2023. Animatable 3D Gaussian: Fast and High-Quality Reconstruction of Multiple Human Avatars. arXiv preprint arXiv:2311.16482 (2023)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480484"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Jonathon Luiten Georgios Kopanas Bastian Leibe and Deva Ramanan. 2023. Dynamic 3D Gaussians: Tracking by Persistent Dynamic View Synthesis. arXiv:2308.09713 [cs.CV]","DOI":"10.1109\/3DV62453.2024.00044"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530127"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_2_25_1","volume-title":"Gaussianavatars: Photorealistic head avatars with rigged 3d gaussians. arXiv preprint arXiv:2312.02069","author":"Qian Shenhan","year":"2023","unstructured":"Shenhan Qian, Tobias Kirschstein, Liam Schoneveld, Davide Davoli, Simon Giebenhain, and Matthias Nie\u00dfner. 2023. Gaussianavatars: Photorealistic head avatars with rigged 3d gaussians. arXiv preprint arXiv:2312.02069 (2023)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_39"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2022.3146783"},{"key":"e_1_3_2_2_28_1","first-page":"4","article-title":"Speech2Talking- Face: Inferring and Driving a Face with Synchronized Audio-Visual Representation","volume":"2","author":"Sun Yasheng","year":"2021","unstructured":"Yasheng Sun, Hang Zhou, Ziwei Liu, and Hideki Koike. 2021. Speech2Talking- Face: Inferring and Driving a Face with Synchronized Audio-Visual Representation.. In IJCAI, Vol. 2. 4.","journal-title":"IJCAI"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_2_30_1","volume-title":"Real-time Neural Radiance Talking Portrait Synthesis via Audio-spatial Decomposition. arXiv preprint arXiv:2211.12368","author":"Tang Jiaxiang","year":"2022","unstructured":"Jiaxiang Tang, Kaisiyuan Wang, Hang Zhou, Xiaokang Chen, Dongliang He, Tianshu Hu, Jingtuo Liu, Gang Zeng, and JingdongWang. 2022. Real-time Neural Radiance Talking Portrait Synthesis via Audio-spatial Decomposition. arXiv preprint arXiv:2211.12368 (2022)."},{"key":"e_1_3_2_2_31_1","volume-title":"Neural Voice Puppetry: Audio-Driven Facial Reenactment. In European conference on computer vision. Springer, 716--731","author":"Thies Justus","year":"2020","unstructured":"Justus Thies, Mohamed Elgharib, Ayush Tewari, Christian Theobalt, and Matthias Nie\u00dfner. 2020. Neural Voice Puppetry: Audio-Driven Facial Reenactment. In European conference on computer vision. Springer, 716--731."},{"key":"e_1_3_2_2_32_1","unstructured":"Jie Wang Jiu-Cheng Xie Xianyan Li Feng Xu Chi-Man Pun and Hao Gao. 2024. GaussianHead: High-fidelity Head Avatars with Learnable Gaussian Derivation. arXiv:2312.01632 [cs.CV]"},{"key":"e_1_3_2_2_33_1","volume-title":"MEAD: A Large-Scale Audio- Visual Dataset for Emotional Talking-Face Generation. In European conference on computer vision. Springer, 700--717","author":"Wang Kaisiyuan","year":"2020","unstructured":"Kaisiyuan Wang, Qianyi Wu, Linsen Song, Zhuoqian Yang, Wayne Wu, Chen Qian, Ran He, Yu Qiao, and Chen Change Loy. 2020. MEAD: A Large-Scale Audio- Visual Dataset for Emotional Talking-Face Generation. In European conference on computer vision. Springer, 700--717."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"e_1_3_2_2_35_1","unstructured":"Guanjun Wu Taoran Yi Jiemin Fang Lingxi Xie Xiaopeng Zhang Wei Wei Wenyu Liu Qi Tian and Xinggang Wang. 2023. 4D Gaussian Splatting for Real-Time Dynamic Scene Rendering. arXiv:2310.08528 [cs.CV]"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Ziyi Yang Xinyu Gao Wen Zhou Shaohui Jiao Yuqing Zhang and Xiaogang Jin. 2023. Deformable 3D Gaussians for High-Fidelity Monocular Dynamic Scene Reconstruction. arXiv:2309.13101 [cs.CV]","DOI":"10.1109\/CVPR52733.2024.01922"},{"key":"e_1_3_2_2_37_1","unstructured":"Zeyu Yang Hongye Yang Zijie Pan Xiatian Zhu and Li Zhang. 2023. Real-time Photorealistic Dynamic Scene Representation and Rendering with 4D Gaussian Splatting. arXiv:2310.10642 [cs.CV]"},{"key":"e_1_3_2_2_38_1","volume-title":"DFA-NeRF: Personalized Talking Head Generation via Disentangled Face Attributes Neural Rendering. arXiv preprint arXiv:2201.00791","author":"Yao Shunyu","year":"2022","unstructured":"Shunyu Yao, RuiZhe Zhong, Yichao Yan, Guangtao Zhai, and Xiaokang Yang. 2022. DFA-NeRF: Personalized Talking Head Generation via Disentangled Face Attributes Neural Rendering. arXiv preprint arXiv:2201.00791 (2022)."},{"key":"e_1_3_2_2_39_1","volume-title":"GeneFace: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation. arXiv preprint arXiv:2305.00787","author":"Ye Zhenhui","year":"2023","unstructured":"Zhenhui Ye, Jinzheng He, Ziyue Jiang, Rongjie Huang, Jiawei Huang, Jinglin Liu, Yi Ren, Xiang Yin, Zejun Ma, and Zhou Zhao. 2023. GeneFace: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation. arXiv preprint arXiv:2305.00787 (2023)."},{"key":"e_1_3_2_2_40_1","volume-title":"The Eleventh International Conference on Learning Representations.","author":"Ye Zhenhui","year":"2022","unstructured":"Zhenhui Ye, Ziyue Jiang, Yi Ren, Jinglin Liu, Jinzheng He, and Zhou Zhao. 2022. GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis. In The Eleventh International Conference on Learning Representations."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_6"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2973374"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG) 39, 6 (2020), 1--15.","journal-title":"ACM Transactions On Graphics (TOG)"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681627","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681627","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681627"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":47,"alternative-id":["10.1145\/3664647.3681627","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681627","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}