{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:06:47Z","timestamp":1765343207225,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","funder":[{"name":"the National Key Research and Development Program of China","award":["2023ZD0121402"],"award-info":[{"award-number":["2023ZD0121402"]}]},{"name":"the Shanghai Science and Technology Commission Blockchain Special Project","award":["24BC3200100"],"award-info":[{"award-number":["24BC3200100"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755678","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"2104-2112","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Visual-informed Silent Video Identity Conversion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-3014-1952","authenticated-orcid":false,"given":"Yifan","family":"Liu","sequence":"first","affiliation":[{"name":"LUMIA Lab, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4682-1314","authenticated-orcid":false,"given":"Yu","family":"Fang","sequence":"additional","affiliation":[{"name":"LUMIA Lab, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7204-0689","authenticated-orcid":false,"given":"Zhouhan","family":"Lin","sequence":"additional","affiliation":[{"name":"LUMIA Lab, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. ArXiv, Vol. abs\/1809.00496 (2018). https:\/\/api.semanticscholar.org\/CorpusID:52155419"},{"key":"e_1_3_2_2_2_1","volume-title":"CLUB: A Contrastive Log-ratio Upper Bound of Mutual Information. arXiv:2006.12013 [cs.LG] https:\/\/arxiv.org\/abs\/2006.12013","author":"Cheng Pengyu","year":"2020","unstructured":"Pengyu Cheng, Weituo Hao, Shuyang Dai, Jiachang Liu, Zhe Gan, and Lawrence Carin. 2020. CLUB: A Contrastive Log-ratio Upper Bound of Mutual Information. arXiv:2006.12013 [cs.LG] https:\/\/arxiv.org\/abs\/2006.12013"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Jeongsoo Choi Joanna Hong and Yong Man Ro. 2023a. DiffV2S: Diffusion-based Video-to-Speech Synthesis with Vision-guided Speaker Embedding. arXiv:2308.07787 [cs.SD]","DOI":"10.1109\/ICCV51070.2023.00718"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Jeongsoo Choi Minsu Kim and Yong Man Ro. 2023b. Intelligible Lip-to-Speech Synthesis with Speech Units. arXiv:2305.19603 [cs.SD]","DOI":"10.21437\/Interspeech.2023-194"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Joon Son Chung Arsha Nagrani and Andrew Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In Interspeech. https:\/\/api.semanticscholar.org\/CorpusID:49211906","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_2_2_6_1","volume-title":"Shah Nawaz, Muhammad Irzam Liaqat, Markus Schedl, and Mubashir Noman.","author":"Hannan Abdul","year":"2025","unstructured":"Abdul Hannan, Muhammad Arslan Manzoor, Shah Nawaz, Muhammad Irzam Liaqat, Markus Schedl, and Mubashir Noman. 2025. PAEFF: Precise Alignment and Enhanced Gated Feature Fusion for Face-Voice Association. arXiv:2505.17002 [cs.CV] https:\/\/arxiv.org\/abs\/2505.17002"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","unstructured":"Wei-Ning Hsu Tal Remez Bowen Shi Jacob Donley and Yossi Adi. 2023. ReVISE: Self-Supervised Speech Resynthesis with Visual Input for Universal and Generalized Speech Regeneration. In 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 18796-18806. doi:10.1109\/CVPR52729.2023.01802","DOI":"10.1109\/CVPR52729.2023.01802"},{"key":"e_1_3_2_2_8_1","first-page":"1","volume-title":"Lip-to-Speech Synthesis in the Wild with Multi-Task Learning. ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2023","author":"Kim Minsu","year":"2023","unstructured":"Minsu Kim, Joanna Hong, and Yong Man Ro. 2023. Lip-to-Speech Synthesis in the Wild with Multi-Task Learning. ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2023), 1-5. https:\/\/api.semanticscholar.org\/CorpusID:257019598"},{"key":"e_1_3_2_2_9_1","unstructured":"Jungil Kong Jaehyeon Kim and Jaekyoung Bae. 2020. HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis. arXiv:2010.05646 [cs.SD]"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-232"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095191"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Yifan Liu Yu Fang and Zhouhan Lin. 2025. DiVISe: Direct Visual-Input Speech Synthesis Preserving Speaker Characteristics And Intelligibility. arXiv:2503.05223 [cs.SD] https:\/\/arxiv.org\/abs\/2503.05223","DOI":"10.18653\/v1\/2025.findings-naacl.130"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475198"},{"key":"e_1_3_2_2_14_1","volume-title":"SVTS: Scalable Video-to-Speech Synthesis. arXiv:2205.02058 [cs.SD]","author":"Mira Rodrigo","year":"2022","unstructured":"Rodrigo Mira, Alexandros Haliassos, Stavros Petridis, Bj\u00f6rn W. Schuller, and Maja Pantic. 2022. SVTS: Scalable Video-to-Speech Synthesis. arXiv:2205.02058 [cs.SD]"},{"key":"e_1_3_2_2_15_1","volume-title":"NISQA: A Deep CNN-Self-Attention Model for Multidimensional Speech Quality Prediction with Crowdsourced Datasets. In Interspeech. https:\/\/api.semanticscholar.org\/CorpusID:233296150","author":"Mittag Gabriel","year":"2021","unstructured":"Gabriel Mittag, Babak Naderi, Assmaa Chehadi, and Sebastian M\u00f6ller. 2021. NISQA: A Deep CNN-Self-Attention Model for Multidimensional Speech Quality Prediction with Crowdsourced Datasets. In Interspeech. https:\/\/api.semanticscholar.org\/CorpusID:233296150"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"K R Prajwal Rudrabha Mukhopadhyay Vinay Namboodiri and C V Jawahar. 2020a. Learning Individual Speaking Styles for Accurate Lip to Speech Synthesis. arXiv:2005.08209 [cs.CV]","DOI":"10.1109\/CVPR42600.2020.01381"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_2_18_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_2_19_1","unstructured":"Yan Rong and Li Liu. 2024. Seeing Your Speech Style: A Novel Zero-Shot Identity-Disentanglement Face-based Voice Conversion. arXiv:2409.00700 [cs.SD] https:\/\/arxiv.org\/abs\/2409.00700"},{"key":"e_1_3_2_2_20_1","volume-title":"Shah Nawaz, Muhammad Haroon Yousaf, and Alessio Del Bue.","author":"Saeed Muhammad Saad","year":"2021","unstructured":"Muhammad Saad Saeed, Muhammad Haris Khan, Shah Nawaz, Muhammad Haroon Yousaf, and Alessio Del Bue. 2021. Fusion and Orthogonal Projection for Improved Face-Voice Association. arXiv:2112.10483 [cs.CV] https:\/\/arxiv.org\/abs\/2112.10483"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613825"},{"key":"e_1_3_2_2_22_1","volume-title":"Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. ArXiv","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdel rahman Mohamed. 2022a. Learning Audio-Visual Speech Representation by Masked Multimodal Cluster Prediction. ArXiv, Vol. abs\/2201.02184 (2022). https:\/\/api.semanticscholar.org\/CorpusID:245769552"},{"key":"e_1_3_2_2_23_1","volume-title":"Learning lip-based audio-visual speaker embeddings with av-hubert. arXiv preprint arXiv:2205.07180","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Abdelrahman Mohamed, and Wei-Ning Hsu. 2022b. Learning lip-based audio-visual speaker embeddings with av-hubert. arXiv preprint arXiv:2205.07180 (2022)."},{"key":"e_1_3_2_2_24_1","volume-title":"Musan: A music, speech, and noise corpus. arXiv preprint arXiv:1510.08484","author":"Snyder David","year":"2015","unstructured":"David Snyder, Guoguo Chen, and Daniel Povey. 2015. Musan: A music, speech, and noise corpus. arXiv preprint arXiv:1510.08484 (2015)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-283"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26607"},{"key":"e_1_3_2_2_28_1","unstructured":"Yochai Yemini Aviv Shamsian Lior Bracha Sharon Gannot and Ethan Fetaya. 2024. LipVoicer: Generating Speech from Silent Videos Guided by Lip Reading. arXiv:2306.03258 [eess.AS] https:\/\/arxiv.org\/abs\/2306.03258"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755678","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:03:52Z","timestamp":1765343032000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755678"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":28,"alternative-id":["10.1145\/3746027.3755678","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755678","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}