{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:04:47Z","timestamp":1750309487158,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680975","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"6123-6132","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Audio-Driven Identity Manipulation for Face Inpainting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7179-5045","authenticated-orcid":false,"given":"Yuqi","family":"Sun","sequence":"first","affiliation":[{"name":"Shanghai Key Laboratory of Intelligent Information Processing, School of Computer Science, Fudan University, Shanghai, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3808-3492","authenticated-orcid":false,"given":"Qing","family":"Lin","sequence":"additional","affiliation":[{"name":"I2R and CFAR, Agency for Science, Technology and Research (A*STAR), Nanyang Technological University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7677-4772","authenticated-orcid":false,"given":"Weimin","family":"Tan","sequence":"additional","affiliation":[{"name":"Shanghai Key Laboratory of Intelligent Information Processing, School of Computer Science, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0256-9682","authenticated-orcid":false,"given":"Bo","family":"Yan","sequence":"additional","affiliation":[{"name":"Shanghai Key Laboratory of Intelligent Information Processing, School of Computer Science, Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587588"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00020"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"e_1_3_2_1_4_1","volume-title":"Voice-Face Homogeneity Tells Deepfake. arXiv preprint arXiv:2203.02195","author":"Cheng Harry","year":"2022","unstructured":"Harry Cheng, Yangyang Guo, Tianyi Wang, Qi Li, Tao Ye, and Liqiang Nie. 2022. Voice-Face Homogeneity Tells Deepfake. arXiv preprint arXiv:2203.02195 (2022)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/SIBGRAPI54419.2021.00040"},{"key":"e_1_3_2_1_6_1","volume-title":"ArcFace: Additive Angular Margin Loss for Deep Face Recognition. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Deng Jiankang","year":"2019","unstructured":"Jiankang Deng, J. Guo, and Stefanos Zafeiriou. 2019. ArcFace: Additive Angular Margin Loss for Deep Face Recognition. 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2019), 4685--4694."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12303"},{"key":"e_1_3_2_1_8_1","volume-title":"Eye In-painting with Exemplar Generative Adversarial Networks. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Dolhansky Brian","year":"2018","unstructured":"Brian Dolhansky and Cristian Canton-Ferrer. 2018. Eye In-painting with Exemplar Generative Adversarial Networks. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 7902--7911."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3111648"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Amanda Cardoso Duarte Francisco Roldan Miquel Tubau Janna Escur Santiago Pascual Amaia Salvador Eva Mohedano Kevin McGuinness Jordi Torres and Xavier Giro-i Nieto. 2019. WAV2PIX: Speech-conditioned Face Generation using Generative Adversarial Networks.. In ICASSP. 8633--8637.","DOI":"10.1109\/ICASSP.2019.8682970"},{"key":"e_1_3_2_1_11_1","volume-title":"Taming Transformers for High-Resolution Image Synthesis. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Esser Patrick","year":"2021","unstructured":"Patrick Esser, Robin Rombach, and Bj\u00f6rn Ommer. 2021. Taming Transformers for High-Resolution Image Synthesis. 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021), 12868--12878."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2010-749"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2967754"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_15_1","volume-title":"VQFR: Blind Face Restoration with Vector-Quantized Dictionary and Parallel Decoder. ArXiv","author":"Gu Yuchao","year":"2022","unstructured":"Yuchao Gu, Xintao Wang, Liangbin Xie, Chao Dong, Gengyan Li, Ying Shan, and Mingg-Ming Cheng. 2022. VQFR: Blind Face Restoration with Vector-Quantized Dictionary and Parallel Decoder. ArXiv, Vol. abs\/2205.06803 (2022)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.4927554"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.207"},{"key":"e_1_3_2_1_18_1","volume-title":"Beyond Face Rotation: Global and Local Perception GAN for Photorealistic and Identity Preserving Frontal View Synthesis. 2017 IEEE International Conference on Computer Vision (ICCV)","author":"Huang Rui","year":"2017","unstructured":"Rui Huang, Shu Zhang, Tianyu Li, and Ran He. 2017. Beyond Face Rotation: Global and Local Perception GAN for Photorealistic and Identity Preserving Frontal View Synthesis. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 2458--2467."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2003.09.005"},{"key":"e_1_3_2_1_20_1","volume-title":"Montreal,(Report) CRIM-06\/08--13","author":"Kenny Patrick","year":"2005","unstructured":"Patrick Kenny. 2005. Joint factor analysis of speaker and session variability: Theory and algorithms. CRIM, Montreal,(Report) CRIM-06\/08--13, Vol. 14, 28--29 (2005), 2."},{"key":"e_1_3_2_1_21_1","volume-title":"Enhancement and overlap in the speech chain. Language","author":"Keyser Samuel Jay","year":"2006","unstructured":"Samuel Jay Keyser and Kenneth Noble Stevens. 2006. Enhancement and overlap in the speech chain. Language (2006), 33--63."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2008.4761624"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459250"},{"key":"e_1_3_2_1_24_1","volume-title":"ByeGlassesGAN: Identity Preserving Eyeglasses Removal for Face Images. ArXiv","author":"Lee Yu-Hui","year":"2020","unstructured":"Yu-Hui Lee and Shang-Hong Lai. 2020. ByeGlassesGAN: Identity Preserving Eyeglasses Removal for Face Images. ArXiv, Vol. abs\/2008.11042 (2020)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3130196"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00778"},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1869--1878","author":"Li Xiaoguang","year":"2022","unstructured":"Xiaoguang Li, Qing Guo, Di Lin, Ping Li, Wei Feng, and Song Wang. 2022. MISF: Multi-level Interactive Siamese Filtering for High-Fidelity Image Inpainting. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 1869--1878."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475559"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00427"},{"key":"e_1_3_2_1_30_1","volume-title":"Reduce Information Loss in Transformers for Pluralistic Image Inpainting. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Liu Qiankun","year":"2022","unstructured":"Qiankun Liu, Zhentao Tan, Dongdong Chen, Qi Chu, Xiyang Dai, Yinpeng Chen, Mengchen Liu, Lu Yuan, and Nenghai Yu. 2022. Reduce Information Loss in Transformers for Pluralistic Image Inpainting. 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022), 11337--11347."},{"key":"e_1_3_2_1_31_1","volume-title":"SphereFace: Deep Hypersphere Embedding for Face Recognition. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Liu Weiyang","year":"2017","unstructured":"Weiyang Liu, Yandong Wen, Zhiding Yu, Ming Li, Bhiksha Raj, and Le Song. 2017. SphereFace: Deep Hypersphere Embedding for Face Recognition. 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017), 6738--6746."},{"key":"e_1_3_2_1_32_1","volume-title":"ISCA Tutorial and Research Workshop (ITRW) on Speech and Emotion.","author":"McGilloway Sin\u00e9ad","year":"2000","unstructured":"Sin\u00e9ad McGilloway, Roddy Cowie, Ellen Douglas-Cowie, Stan Gielen, Machiel Westerdijk, and Sybert Stroeve. 2000. Approaching automatic recognition of emotion from voice: A rough benchmark. In ISCA Tutorial and Research Workshop (ITRW) on Speech and Emotion."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.1910470"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00772"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Omkar M. Parkhi Andrea Vedaldi and Andrew Zisserman. 2015. Deep Face Recognition. In BMVC.","DOI":"10.5244\/C.29.41"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1044\/jshr.0902.273"},{"key":"e_1_3_2_1_38_1","volume-title":"Speaker identification and verification using Gaussian mixture speaker models. Speech communication","author":"Reynolds Douglas A","year":"1995","unstructured":"Douglas A Reynolds. 1995. Speaker identification and verification using Gaussian mixture speaker models. Speech communication, Vol. 17, 1--2 (1995), 91--108."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00009"},{"key":"e_1_3_2_1_40_1","volume-title":"2020 International Conference on Communications, Signal Processing, and their Applications (ICCSPA)","author":"Salem Nermin M.","year":"2021","unstructured":"Nermin M. Salem, Hani M. K. Mahdi, and Hazem M. Abbas. 2021. A Novel Face Inpainting Approach Based on Guided Deep Learning. 2020 International Conference on Communications, Signal Processing, and their Applications (ICCSPA) (2021), 1--6."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cortex.2010.11.011"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.3758\/s13414-015-1045-8"},{"key":"e_1_3_2_1_43_1","first-page":"2579","article-title":"Visualizing Data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey E. Hinton. 2008. Visualizing Data using t-SNE. Journal of Machine Learning Research, Vol. 9 (2008), 2579--2605.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_44_1","volume-title":"High-Fidelity Pluralistic Image Completion with Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Wan Ziyu","year":"2021","unstructured":"Ziyu Wan, Jingbo Zhang, Dongdong Chen, and Jing Liao. 2021. High-Fidelity Pluralistic Image Completion with Transformers. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021), 4672--4681."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00465"},{"key":"e_1_3_2_1_46_1","volume-title":"CosFace: Large Margin Cosine Loss for Deep Face Recognition. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Wang H.","year":"2018","unstructured":"H. Wang, Yitong Wang, Zheng Zhou, Xing Ji, Zhifeng Li, Dihong Gong, Jin Zhou, and Wei Liu. 2018. CosFace: Large Margin Cosine Loss for Deep Face Recognition. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 5265--5274."},{"key":"e_1_3_2_1_47_1","volume-title":"Face reconstruction from voice using generative adversarial networks. Advances in neural information processing systems","author":"Wen Yandong","year":"2019","unstructured":"Yandong Wen, Bhiksha Raj, and Rita Singh. 2019. Face reconstruction from voice using generative adversarial networks. Advances in neural information processing systems, Vol. 32 (2019)."},{"volume-title":"Face Reconstruction from Voice Using Generative Adversarial Networks","author":"Wen Yandong","key":"e_1_3_2_1_48_1","unstructured":"Yandong Wen, Rita Singh, and Bhiksha Raj. 2019. Face Reconstruction from Voice Using Generative Adversarial Networks. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01352"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2816163"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_1_52_1","volume-title":"Identity Preserving Face Completion for Large Ocular Region Occlusion. ArXiv","author":"Zhao Yajie","year":"2018","unstructured":"Yajie Zhao, Weikai Chen, Jun Xing, Xiaoming Li, Zachary Bessinger, Fuchang Liu, Wangmeng Zuo, and Ruigang Yang. 2018. Identity Preserving Face Completion for Large Ocular Region Occlusion. ArXiv, Vol. abs\/1807.08772 (2018)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680975","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680975","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:35Z","timestamp":1750295855000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680975"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":54,"alternative-id":["10.1145\/3664647.3680975","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680975","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}