{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T03:35:55Z","timestamp":1773200155740,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681108","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"3170-3179","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":8,"title":["SegTalker: Segmentation-based Talking Face Generation with Mask-guided Local Editing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9453-5423","authenticated-orcid":false,"given":"Lingyu","family":"Xiong","sequence":"first","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9708-3225","authenticated-orcid":false,"given":"Xize","family":"Cheng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7777-2162","authenticated-orcid":false,"given":"Jintao","family":"Tan","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8742-5339","authenticated-orcid":false,"given":"Xianjia","family":"Wu","sequence":"additional","affiliation":[{"name":"Huawei Cloud Computing Technologies Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7724-3966","authenticated-orcid":false,"given":"Xiandong","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Cloud Computing Technologies Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4795-9670","authenticated-orcid":false,"given":"Lei","family":"Zhu","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5388-9125","authenticated-orcid":false,"given":"Fei","family":"Ma","sequence":"additional","affiliation":[{"name":"Guangdong Laboratory of Artificial Intelligence and Digital Economy (SZ), Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1427-3507","authenticated-orcid":false,"given":"Minglei","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Cloud Computing Technologies Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5405-5467","authenticated-orcid":false,"given":"Huang","family":"Xu","sequence":"additional","affiliation":[{"name":"Huawei Cloud Computing Technologies Co., Ltd, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3336-1817","authenticated-orcid":false,"given":"Zhihui","family":"Hu","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00453"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00832"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00664"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01796"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548101"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555399"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Xize Cheng Rongjie Huang Linjun Li Tao Jin Zehan Wang Aoxiong Yin Minglei Li Xinyu Duan Zhou Zhao et al. 2023. TransFace: Unit-Based Audio-Visual Speech Synthesizer for Talking Head Translation. arXiv preprint arXiv:2312.15197 (2023).","DOI":"10.18653\/v1\/2024.findings-acl.593"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01442"},{"key":"e_1_3_2_1_10_1","volume-title":"Opensr: Open-modality speech recognition via maintaining multi-modality alignment. arXiv preprint arXiv:2306.06410","author":"Cheng Xize","year":"2023","unstructured":"Xize Cheng, Tao Jin, Linjun Li, Wang Lin, Xinyu Duan, and Zhou Zhao. 2023. Opensr: Open-modality speech recognition via maintaining multi-modality alignment. arXiv preprint arXiv:2306.06410 (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Out of Time: Automated Lip Sync in the Wild. In ACCV Workshops.","author":"Chung Joon Son","year":"2016","unstructured":"Joon Son Chung and Andrew Zisserman. 2016. Out of Time: Automated Lip Sync in the Wild. In ACCV Workshops."},{"key":"e_1_3_2_1_12_1","volume-title":"Generative adversarial nets. Advances in neural information processing systems","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_2_1_14_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_15_1","volume-title":"Av-transpeech: Audio-visual robust speech-to-speech translation. arXiv preprint arXiv:2305.15403","author":"Huang Rongjie","year":"2023","unstructured":"Rongjie Huang, Huadai Liu, Xize Cheng, Yi Ren, Linjun Li, Zhenhui Ye, Jinzheng He, Lichao Zhang, Jinglin Liu, Xiang Yin, et al. 2023. Av-transpeech: Audio-visual robust speech-to-speech translation. arXiv preprint arXiv:2305.15403 (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612291"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01368"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00225"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00829"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01357"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555436"},{"key":"e_1_3_2_1_24_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00232"},{"key":"e_1_3_2_1_27_1","volume-title":"Pivotal tuning for latent-based editing of real images. ACM Transactions on graphics (TOG)","author":"Roich Daniel","year":"2022","unstructured":"Daniel Roich, Ron Mokady, Amit H Bermano, and Daniel Cohen-Or. 2022. Pivotal tuning for latent-based editing of real images. ACM Transactions on graphics (TOG), Vol. 42, 1 (2022), 1--13."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_29_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention. 234--241.","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In Medical Image Computing and Computer-Assisted Intervention. 234--241."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00202"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_39"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00197"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00926"},{"key":"e_1_3_2_1_34_1","volume-title":"Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. arXiv preprint arXiv:2201.02184","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdelrahman Mohamed. 2022. Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. arXiv preprint arXiv:2201.02184 (2022)."},{"key":"e_1_3_2_1_35_1","volume-title":"Robust Self-Supervised Audio-Visual Speech Recognition. arXiv preprint arXiv:2201.01763","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, and Abdelrahman Mohamed. 2022. Robust Self-Supervised Audio-Visual Speech Recognition. arXiv preprint arXiv:2201.01763 (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2022.3146783"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_1_38_1","volume-title":"Landmark-guided Diffusion Model for High-fidelity and Temporally Coherent Talking Head Generation. arXiv preprint arXiv:2408.01732","author":"Tan Jintao","year":"2024","unstructured":"Jintao Tan, Xize Cheng, Lingyu Xiong, Lei Zhu, Xiandong Li, Xianjia Wu, Kai Gong, Minglei Li, and Yi Cai. 2024. Landmark-guided Diffusion Model for High-fidelity and Temporally Coherent Talking Head Generation. arXiv preprint arXiv:2408.01732 (2024)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28313"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459838"},{"key":"e_1_3_2_1_41_1","volume-title":"ICLR Workshop on Deep Generative Models for Highly Structured Data.","author":"Unterthiner Thomas","year":"2019","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Rapha\u00ebl Marinier, Marcin Michalski, and Sylvain Gelly. 2019. FVD: A new Metric for Video Generation. In ICLR Workshop on Deep Generative Models for Highly Structured Data."},{"key":"e_1_3_2_1_42_1","first-page":"2579","article-title":"Visualizing Data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing Data using t-SNE. Journal of Machine Learning Research, Vol. 9, 86 (2008), 2579--2605.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_2_1_45_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing, Vol. 13, 4 (2004), 600--612."},{"key":"e_1_3_2_1_46_1","unstructured":"Zehan Wang Ziang Zhang Xize Cheng Rongjie Huang Luping Liu Zhenhui Ye Haifeng Huang Yang Zhao Tao Jin Peng Gao et al. 2024. Molecule-Space: Free Lunch in Unified Multimodal Space via Knowledge Fusion. arXiv preprint arXiv:2405.04883 (2024)."},{"key":"e_1_3_2_1_47_1","first-page":"22099","article-title":"Connecting multi-modal contrastive representations","volume":"36","author":"Wang Zehan","year":"2023","unstructured":"Zehan Wang, Yang Zhao, Haifeng Huang, Jiageng Liu, Aoxiong Yin, Li Tang, Linjun Li, Yongqi Wang, Ziang Zhang, and Zhou Zhao. 2023. Connecting multi-modal contrastive representations. Advances in Neural Information Processing Systems, Vol. 36 (2023), 22099--22114.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26613"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01267"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01259"},{"key":"e_1_3_2_1_51_1","volume-title":"Feature-style encoder for style-based gan inversion. arXiv preprint arXiv:2202.02183","author":"Yao Xu","year":"2022","unstructured":"Xu Yao, Alasdair Newson, Yann Gousseau, and Pierre Hellier. 2022. Feature-style encoder for style-based gan inversion. arXiv preprint arXiv:2202.02183 (2022)."},{"key":"e_1_3_2_1_52_1","volume-title":"StyleHEAT: One-Shot High-Resolution Editable Talking Face Generation via Pre-trained StyleGAN. arxiv:2203.04036","author":"Yin Fei","year":"2022","unstructured":"Fei Yin, Yong Zhang, Xiaodong Cun, Mingdeng Cao, Yanbo Fan, Xuan Wang, Qingyan Bai, Baoyuan Wu, Jue Wang, and Yujiu Yang. 2022. StyleHEAT: One-Shot High-Resolution Editable Talking Face Generation via Pre-trained StyleGAN. arxiv:2203.04036 (2022)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"e_1_3_2_1_54_1","volume-title":"MyStyle: A Controllable Personalized Generative Prior. arXiv preprint arXiv:2306.04865","author":"Zeng Libing","year":"2023","unstructured":"Libing Zeng, Lele Chen, Yi Xu, and Nima Kalantari. 2023. MyStyle: A Controllable Personalized Generative Prior. arXiv preprint arXiv:2306.04865 (2023)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Richard Zhang Phillip Isola Alexei A Efros Eli Shechtman and Oliver Wang. 2018. The Unreasonable Effectiveness of Deep Features as a Perceptual Metric. In CVPR.","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG), Vol. 39, 6 (2020), 1--15.","journal-title":"ACM Transactions On Graphics (TOG)"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681108","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681108","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:53Z","timestamp":1750294673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681108"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":59,"alternative-id":["10.1145\/3664647.3681108","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681108","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}