{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:05:21Z","timestamp":1776888321823,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3613825","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"8443-8452","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Face-Driven Zero-Shot Voice Conversion with Memory-based Face-Voice Alignment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0638-5530","authenticated-orcid":false,"given":"Zheng-Yan","family":"Sheng","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6668-022X","authenticated-orcid":false,"given":"Yang","family":"Ai","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0200-9167","authenticated-orcid":false,"given":"Yan-Nian","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7853-5273","authenticated-orcid":false,"given":"Zhen-Hua","family":"Ling","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the Empirical Methods in Natural Language Processing (EMNLP). 66--71","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, Andrew Senior, Oriol Vinyals, and Andrew Zisserman. 2018. LRS3-TED: A large-Scale dataset for visual speech recognition. In Proceedings of the Empirical Methods in Natural Language Processing (EMNLP). 66--71."},{"key":"e_1_3_2_1_2_1","volume-title":"Vq-wav2vec: self-supervised learning of discrete speech representations. arXiv preprint arXiv:1910.05453","author":"Baevski Alexei","year":"2019","unstructured":"Alexei Baevski, Steffen Schneider, and Michael Auli. 2019. Vq-wav2vec: self-supervised learning of discrete speech representations. arXiv preprint arXiv:1910.05453 (2019)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547850"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2019.04.008"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00802"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747140"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the International conference on machine learning (ICML). PMLR, 1779--1788","author":"Cheng Pengyu","year":"2020","unstructured":"Pengyu Cheng, Weituo Hao, Shuyang Dai, Jiachang Liu, Zhe Gan, and Lawrence Carin. 2020. Club: A contrastive log-ratio upper bound of mutual information. In Proceedings of the International conference on machine learning (ICML). PMLR, 1779--1788."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Choi Hyeong-Seok","year":"2020","unstructured":"Hyeong-Seok Choi, Changdae Park, and Kyogu Lee. 2020. From inference to generation: End-to-end fully self-supervised generation of human face from speech. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461471"},{"key":"e_1_3_2_1_10_1","volume-title":"IQDUBBING: Prosody modeling based on discrete self-supervised speech representation for expressive voice conversion. arXiv preprint arXiv:2201.00269","author":"Gan Wendong","year":"2022","unstructured":"Wendong Gan, Bolong Wen, Ying Yan, Haitao Chen, Zhichao Wang, Hongqiang Du, Lei Xie, Kaixuan Guo, and Hai Li. 2022. IQDUBBING: Prosody modeling based on discrete self-supervised speech representation for expressive voice conversion. arXiv preprint arXiv:2201.00269 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2136"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cub.2003.09.005"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639535"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2917232"},{"key":"e_1_3_2_1_15_1","volume-title":"Yasunori Ohishi, and Takuhiro Kaneko.","author":"Kameoka Hirokazu","year":"2019","unstructured":"Hirokazu Kameoka, Kou Tanaka, Aaron Valero Puche, Yasunori Ohishi, and Takuhiro Kaneko. 2019b. Crossmodal voice conversion. arXiv preprint arXiv:1904.04540 (2019)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO.2018.8553236"},{"key":"e_1_3_2_1_17_1","first-page":"2758","article-title":"Lip to speech synthesis with visual context attentional GAN","volume":"34","author":"Kim Minsu","year":"2021","unstructured":"Minsu Kim, Joanna Hong, and Yong Man Ro. 2021. Lip to speech synthesis with visual context attentional GAN. Proceedings of the Neural Information Processing Systems (NeurIPS), Vol. 34 (2021), 2758--2770.","journal-title":"Proceedings of the Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_18_1","first-page":"294","article-title":"Voicemixer: Adversarial voice style mixup","volume":"34","author":"Lee Sang-Hoon","year":"2021","unstructured":"Sang-Hoon Lee, Ji-Hoon Kim, Hyunseung Chung, and Seong-Whan Lee. 2021. Voicemixer: Adversarial voice style mixup. Proceedings of the Neural Information Processing Systems (NeurIPS), Vol. 34, 294--308.","journal-title":"Proceedings of the Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747272"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475198"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1037\/a0030945"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2017.01.008"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1798"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the Audio for Games. Audio Engineering Society.","author":"Morise Masanori","year":"2009","unstructured":"Masanori Morise, Hideki Kawahara, and Haruhiro Katayose. 2009. Fast and reliable F0 estimation method based on the period extraction of vocal fold vibration of singing voice and speech. In Proceedings of the Audio for Games. Audio Engineering Society."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00772"},{"key":"e_1_3_2_1_26_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20102"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9687866"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML). PMLR, 5210--5219","author":"Qian Kaizhi","year":"2019","unstructured":"Kaizhi Qian, Yang Zhang, Shiyu Chang, Xuesong Yang, and Mark Hasegawa-Johnson. 2019. Autovc: Zero-shot voice style transfer with only autoencoder loss. In Proceedings of the International Conference on Machine Learning (ICML). PMLR, 5210--5219."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461384"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3038524"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1177\/1474704916630317"},{"key":"e_1_3_2_1_35_1","volume-title":"Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786","author":"Song Yang","year":"2018","unstructured":"Yang Song, Jingwen Zhu, Dawei Li, Xiaolong Wang, and Hairong Qi. 2018. Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786 (2018)."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2802--2806","author":"Lal Brij Mohan","year":"2020","unstructured":"Brij Mohan Lal Srivastava, Nathalie Vauquier, Md Sahidullah, Aur\u00e9lien Bellet, Marc Tommasi, and Emmanuel Vincent. 2020. Evaluating voice conversion-based privacy protection against informed attackers. In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2802--2806."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.5555\/1134782.1134792"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1693"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the Fourth Workshop on Speech and Language Processing for Assistive Technologies. 107--111","author":"Veaux Christophe","year":"2013","unstructured":"Christophe Veaux, Junichi Yamagishi, and Simon King. 2013. Towards personalised synthesised voices for individuals with vocal disabilities: Voice banking and reconstruction. In Proceedings of the Fourth Workshop on Speech and Language Processing for Assistive Technologies. 107--111."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"e_1_3_2_1_41_1","volume-title":"Xiao Chen, Xunying Liu, and Helen Meng.","author":"Wang Disong","year":"2021","unstructured":"Disong Wang, Liqun Deng, Yu Ting Yeung, Xiao Chen, Xunying Liu, and Helen Meng. 2021. VQMIVC: vector quantization and mutual information-based unsupervised speech representation disentanglement for one-shot voice conversion. In Proceedings of the International Speech Communication Association (INTERSPEECH). 1344--1348."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747427"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746808"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053842"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the neural information processing systems (NeurIPS), 5265--5274","author":"Wen Yandong","year":"2019","unstructured":"Yandong Wen, Bhiksha Raj, and Rita Singh. 2019. Face reconstruction from voice using generative adversarial networks. Proceedings of the neural information processing systems (NeurIPS), 5265--5274."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01020"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-03227-7"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"SiCheng Yang Methawee Tantrawenith Haolin Zhuang Zhiyong Wu Aolan Sun Jianzong Wang Ning Cheng Huaizhen Tang Xintao Zhao Jie Wang et al. 2022a. Speech representation disentanglement with adversarial mutual information learning for one-shot voice conversion. arXiv preprint arXiv:2208.08757 (2022).","DOI":"10.21437\/Interspeech.2022-571"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9892169"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Zhihan Yang Zhiyong Wu Ying Shan and Jia Jia. 2023. What does your face sound like? 3D face shape towards voice. (2023).","DOI":"10.1609\/aaai.v37i11.26628"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR).","author":"Yuan Siyang","year":"2021","unstructured":"Siyang Yuan, Pengyu Cheng, Ruiyi Zhang, Weituo Hao, Zhe Gan, and Lawrence Carin. 2021. Improving zero-shot voice style transfer via disentangled representation learning. In Proceedings of the International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2960721"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613825","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3613825","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:35Z","timestamp":1755821015000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3613825"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":56,"alternative-id":["10.1145\/3581783.3613825","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3613825","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}