{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T16:10:00Z","timestamp":1772295000668,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["Grant No. 2021YFC3320103"],"award-info":[{"award-number":["Grant No. 2021YFC3320103"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of China (NSFC)","award":["Grants 61972395, 62272460, U19B2038"],"award-info":[{"award-number":["Grants 61972395, 62272460, U19B2038"]}]},{"name":"Beijing Natural Science Foundation","award":["Grant No. 4232037"],"award-info":[{"award-number":["Grant No. 4232037"]}]},{"name":"Young Elite Scientists Sponsorship Program by CAST (YESS)"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611765","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"7718-7727","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Context-Aware Talking-Head Video Editing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3403-376X","authenticated-orcid":false,"given":"Songlin","family":"Yang","sequence":"first","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8598-0831","authenticated-orcid":false,"given":"Wei","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7260-7141","authenticated-orcid":false,"given":"Jun","family":"Ling","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9014-7369","authenticated-orcid":false,"given":"Bo","family":"Peng","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5631-0639","authenticated-orcid":false,"given":"Xu","family":"Tan","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2763-7832","authenticated-orcid":false,"given":"Jing","family":"Dong","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"International conference on machine learning. PMLR, 173--182","author":"Amodei Dario","year":"2016","unstructured":"Dario Amodei, Sundaram Ananthanarayanan, Rishita Anubhai, Jingliang Bai, Eric Battenberg, Carl Case, Jared Casper, Bryan Catanzaro, Qiang Cheng, Guoliang Chen, et al. 2016. Deep speech 2: End-to-end speech recognition in english and mandarin. In International conference on machine learning. PMLR, 173--182."},{"key":"e_1_3_2_1_2_1","volume-title":"GPU Technology Conference","author":"Assael Yannis M","year":"2016","unstructured":"Yannis M Assael, Brendan Shillingford, Shimon Whiteson, and Nando De Freitas. 2016. Lipnet: End-to-end sentence-level lipreading. GPU Technology Conference (2016)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2185520.2185563"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/311535.311556"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.116"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"e_1_3_2_1_8_1","volume-title":"VideoReTalking: Audio-based Lip Synchronization for Talking Head Video Editing In the Wild. In SIGGRAPH Asia 2022 Conference Papers. 1--9.","author":"Cheng Kun","year":"2022","unstructured":"Kun Cheng, Xiaodong Cun, Yong Zhang, Menghan Xia, Fei Yin, Mingrui Zhu, Xuan Wang, Jue Wang, and Nannan Wang. 2022. VideoReTalking: Audio-based Lip Synchronization for Talking Head Video Editing In the Wild. In SIGGRAPH Asia 2022 Conference Papers. 1--9."},{"key":"e_1_3_2_1_9_1","volume-title":"Out of time: automated lip sync in the wild","author":"Chung Joon Son","unstructured":"Joon Son Chung and Andrew Zisserman. 2016. Out of time: automated lip sync in the wild. In ACCV. Springer, 251--263."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.2229005"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_25"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459936"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323028"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00854"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01810"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00500"},{"key":"e_1_3_2_1_19_1","unstructured":"Awni Hannun Carl Case Jared Casper Bryan Catanzaro Greg Diamos Erich Elsen Ryan Prenger Sanjeev Satheesh Shubho Sengupta Adam Coates et al. 2014. Deep speech: Scaling up end-to-end speech recognition. arXiv preprint arXiv:1412.5567 (2014)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"e_1_3_2_1_22_1","volume-title":"Ray tracing","author":"Kajiya James T","year":"1984","unstructured":"James T Kajiya and Brian P Von Herzen. 1984. Ray tracing volume densities. ACM SIGGRAPH computer graphics, Vol. 18, 3 (1984), 165--174."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00278"},{"key":"e_1_3_2_1_24_1","volume-title":"StableFace: Analyzing and Improving Motion Stability for Talking Face Generation. arXiv preprint arXiv:2208.13717","author":"Ling Jun","year":"2022","unstructured":"Jun Ling, Xu Tan, Liyang Chen, Runnan Li, Yuchao Zhang, Sheng Zhao, and Li Song. 2022. StableFace: Analyzing and Improving Motion Stability for Talking Face Generation. arXiv preprint arXiv:2208.13717 (2022)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"e_1_3_2_1_26_1","first-page":"11525","article-title":"Object-centric learning with slot attention","volume":"33","author":"Locatello Francesco","year":"2020","unstructured":"Francesco Locatello, Dirk Weissenborn, Thomas Unterthiner, Aravindh Mahendran, Georg Heigold, Jakob Uszkoreit, Alexey Dosovitskiy, and Thomas Kipf. 2020. Object-centric learning with slot attention. Advances in Neural Information Processing Systems, Vol. 33 (2020), 11525--11538.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475531"},{"key":"e_1_3_2_1_28_1","volume-title":"Semantic 3D-aware Portrait Synthesis and Manipulation Based on Compositional Neural Radiance Field. arXiv preprint arXiv:2302.01579","author":"Ma Tianxiang","year":"2023","unstructured":"Tianxiang Ma, Bingchuan Li, Qian He, Jing Dong, and Tieniu Tan. 2023. Semantic 3D-aware Portrait Synthesis and Manipulation Based on Compositional Neural Radiance Field. arXiv preprint arXiv:2302.01579 (2023)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00581"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.287"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01018"},{"key":"e_1_3_2_1_34_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Ren Yi","year":"2019","unstructured":"Yi Ren, Yangjun Ruan, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2019. Fastspeech: Fast, robust and controllable text to speech. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_35_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. CoRR","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc V. Le, Geoffrey E. Hinton, and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. CoRR, Vol. abs\/1701.06538 (2017). [arXiv]1701.06538"},{"key":"e_1_3_2_1_36_1","volume-title":"Learning Dynamic Facial Radiance Fields for Few-Shot Talking Head Synthesis. In European conference on computer vision.","author":"Shen Shuai","year":"2022","unstructured":"Shuai Shen, Wanhua Li, Zheng Zhu, Yueqi Duan, Jie Zhou, and Jiwen Lu. 2022. Learning Dynamic Facial Radiance Fields for Few-Shot Talking Head Synthesis. In European conference on computer vision."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475196"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/129"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_1_40_1","unstructured":"Xu Tan Jiawei Chen Haohe Liu Jian Cong Chen Zhang Yanqing Liu Xi Wang Yichong Leng Yuanhao Yi Lei He et al. 2022. NaturalSpeech: End-to-End Text to Speech Synthesis with Human-Level Quality. arXiv preprint arXiv:2205.04421 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Zero-Shot Text-to-Speech for Text-Based Insertion in Audio Narration. Interspeech","author":"Tang Chuanxin","year":"2021","unstructured":"Chuanxin Tang, Chong Luo, Zhiyuan Zhao, Dacheng Yin, Yucheng Zhao, and Wenjun Zeng. 2021. Zero-Shot Text-to-Speech for Text-Based Insertion in Audio Narration. Interspeech (2021)."},{"key":"e_1_3_2_1_42_1","volume-title":"Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368","author":"Tang Jiaxiang","year":"2022","unstructured":"Jiaxiang Tang, Kaisiyuan Wang, Hang Zhou, Xiaokang Chen, Dongliang He, Tianshu Hu, Jingtuo Liu, Gang Zeng, and Jingdong Wang. 2022. Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368 (2022)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073699"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2929464.2929475"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01272"},{"key":"e_1_3_2_1_46_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01251-8"},{"key":"e_1_3_2_1_48_1","volume-title":"Audio2head: Audio-driven one-shot talking-head generation with natural head motion. IJCAI","author":"Wang Suzhen","year":"2021","unstructured":"Suzhen Wang, Lincheng Li, Yu Ding, Changjie Fan, and Xin Yu. 2021. Audio2head: Audio-driven one-shot talking-head generation with natural head motion. IJCAI (2021)."},{"key":"e_1_3_2_1_49_1","first-page":"1802","article-title":"Real-time synthesis of Chinese visual speech and facial expressions using MPEG-4 FAP features in a three-dimensional avatar","volume":"4","author":"Wu Zhiyong","year":"2006","unstructured":"Zhiyong Wu, Shen Zhang, Lianhong Cai, and Helen M Meng. 2006. Real-time synthesis of Chinese visual speech and facial expressions using MPEG-4 FAP features in a three-dimensional avatar. Interspeech, Vol. 4 (2006), 1802--1805.","journal-title":"Interspeech"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3449063"},{"key":"e_1_3_2_1_51_1","volume-title":"RetrieverTTS: Modeling Decomposed Factors for Text-Based Speech Insertion. Interspeech","author":"Yin Dacheng","year":"2022","unstructured":"Dacheng Yin, Chuanxin Tang, Yanqing Liu, Xiaoqiang Wang, Zhiyuan Zhao, Yucheng Zhao, Zhiwei Xiong, Sheng Zhao, and Chong Luo. 2022. RetrieverTTS: Modeling Decomposed Factors for Text-Based Speech Insertion. Interspeech (2022)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13545--13555","author":"Zheng Yufeng","year":"2022","unstructured":"Yufeng Zheng, Victoria Fern\u00e1ndez Abrevaya, Marcel C B\u00fchler, Xu Chen, Michael J Black, and Otmar Hilliges. 2022. Im avatar: Implicit morphable head avatars from videos. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13545--13555."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions on Graphics (TOG), Vol. 39, 6 (2020), 1--15.","journal-title":"ACM Transactions on Graphics (TOG)"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Hao Zhu Huaibo Huang Yi Li Aihua Zheng and Ran He. 2020. Arbitrary talking face generation via attentional audio-visual coherence learning. IJCAI (2020)","DOI":"10.24963\/ijcai.2020\/327"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611765","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611765","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:13:28Z","timestamp":1755821608000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611765"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":58,"alternative-id":["10.1145\/3581783.3611765","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611765","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}