{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:03:32Z","timestamp":1777655012767,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.61831022"],"award-info":[{"award-number":["No.61831022"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["2020AAA0108600"],"award-info":[{"award-number":["2020AAA0108600"]}]},{"name":"Beijing Academy of Artificial Intelligence","award":["No. BAAI2019QN0302"],"award-info":[{"award-number":["No. BAAI2019QN0302"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475280","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T17:45:27Z","timestamp":1634579127000},"page":"1478-1486","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":52,"title":["Imitating Arbitrary Talking Style for Realistic Audio-Driven Talking Face Synthesis"],"prefix":"10.1145","author":[{"given":"Haozhe","family":"Wu","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jia","family":"Jia","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yishun","family":"Dou","sequence":"additional","affiliation":[{"name":"HiSilicon Company, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Duan","sequence":"additional","affiliation":[{"name":"HiSilicon Company, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qingshan","family":"Deng","sequence":"additional","affiliation":[{"name":"HiSilicon Company, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"T. Afouras J. S. Chung and A. Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. In arXiv preprint arXiv:1809.00496.  T. Afouras J. S. Chung and A. Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. In arXiv preprint arXiv:1809.00496."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311556"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2013.249"},{"key":"e_1_3_2_2_4_1","volume-title":"Talking-head Generation with Rhythmic Head Motion. In European Conference on Computer Vision. Springer, 35--51","author":"Chen Lele","year":"2020"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"J. S. Chung A. Nagrani and A. Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In INTERSPEECH.  J. S. Chung A. Nagrani and A. Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In INTERSPEECH.","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_2_2_8_1","volume-title":"Lip Reading in the Wild. In Asian Conference on Computer Vision.","author":"Chung J. S."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"e_1_3_2_2_13_1","volume-title":"Speech Driven Talking Face Generation from a Single Image and an Emotion Condition. arXiv e-prints","author":"Eskimez Sefik Emre","year":"2020"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Yudong Guo Jianfei Cai Boyi Jiang Jianmin Zheng etal 2018. Cnn-based real-time dense face reconstruction with inverse-rendered photo-realistic face images. IEEE transactions on pattern analysis and machine intelligence Vol. 41 6 (2018) 1294--1307.  Yudong Guo Jianfei Cai Boyi Jiang Jianmin Zheng et al. 2018. Cnn-based real-time dense face reconstruction with inverse-rendered photo-realistic face images. IEEE transactions on pattern analysis and machine intelligence Vol. 41 6 (2018) 1294--1307.","DOI":"10.1109\/TPAMI.2018.2837742"},{"key":"e_1_3_2_2_15_1","unstructured":"Awni Hannun Carl Case Jared Casper Bryan Catanzaro Greg Diamos Erich Elsen Ryan Prenger Sanjeev Satheesh Shubho Sengupta Adam Coates etal 2014. Deep speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567 (2014).  Awni Hannun Carl Case Jared Casper Bryan Catanzaro Greg Diamos Erich Elsen Ryan Prenger Sanjeev Satheesh Shubho Sengupta Adam Coates et al. 2014. Deep speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567 (2014)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_17_1","volume-title":"Image-to-Image Translation with Conditional Adversarial Networks. CVPR","author":"Isola Phillip","year":"2017"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_2_2_22_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS.2009.58"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_2_25_1","unstructured":"ITUT Recommendation. 2006. Vocabulary for performance and quality of service.  ITUT Recommendation. 2006. Vocabulary for performance and quality of service."},{"key":"e_1_3_2_2_26_1","volume-title":"3rd International Conference on Learning Representations, ICLR","author":"Simonyan Karen","year":"2015"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/2627435.2670313"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_42"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323035"},{"key":"e_1_3_2_2_31_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"e_1_3_2_2_33_1","unstructured":"Kaisiyuan Wang Qianyi Wu Linsen Song Zhuoqian Yang Wayne Wu Chen Qian Ran He Yu Qiao and Chen Change Loy. 2020 b. MEAD: A Large-scale Audio-visual Dataset for Emotional Talking-face Generation. In ECCV.  Kaisiyuan Wang Qianyi Wu Linsen Song Zhuoqian Yang Wayne Wu Chen Qian Ran He Yu Qiao and Chen Change Loy. 2020 b. MEAD: A Large-scale Audio-visual Dataset for Emotional Talking-face Generation. In ECCV."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00917"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_41"},{"key":"e_1_3_2_2_36_1","volume-title":"Audio-driven talking face video generation with learning-based personalized head pose. arXiv e-prints","author":"Yi Ran","year":"2020"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2973374"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413844"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417774"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475280","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475280","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:17Z","timestamp":1750193297000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475280"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":41,"alternative-id":["10.1145\/3474085.3475280","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475280","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}