{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:07:27Z","timestamp":1761898047054,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681386","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"8149-8158","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["SyncTalklip: Highly Synchronized Lip-Readable Speaker Generation with Multi-Task Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7297-4536","authenticated-orcid":false,"given":"Xiaoda","family":"Yang","sequence":"first","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9708-3225","authenticated-orcid":false,"given":"Xize","family":"Cheng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7682-7678","authenticated-orcid":false,"given":"Dongjie","family":"Fu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6488-9695","authenticated-orcid":false,"given":"Minghui","family":"Fang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6876-9943","authenticated-orcid":false,"given":"Jialung","family":"Zuo","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0129-4843","authenticated-orcid":false,"given":"Shengpeng","family":"Ji","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3564-1628","authenticated-orcid":false,"given":"Jin","family":"Tao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2018.2889052"},{"key":"e_1_3_2_1_2_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Afouras Triantafyllos","year":"2018","unstructured":"Triantafyllos Afouras, Joon Son Chung, and Andrew Zisserman. 2018. LRS3-TED: a large-scale dataset for visual speech recognition. arXiv preprint arXiv:1809.00496 (2018)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2019.00802"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2019.00802"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Xize Cheng Rongjie Huang Linjun Li Tao Jin Zehan Wang Aoxiong Yin Minglei Li Xinyu Duan Zhou Zhao et al. 2023. TransFace: Unit-Based Audio-Visual Speech Synthesizer for Talking Head Translation. arXiv preprint arXiv:2312.15197 (2023).","DOI":"10.18653\/v1\/2024.findings-acl.593"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01442"},{"key":"e_1_3_2_1_7_1","volume-title":"Opensr: Open-modality speech recognition via maintaining multi-modality alignment. arXiv preprint arXiv:2306.06410","author":"Cheng Xize","year":"2023","unstructured":"Xize Cheng, Tao Jin, Linjun Li, Wang Lin, Xinyu Duan, and Zhou Zhao. 2023. Opensr: Open-modality speech recognition via maintaining multi-modality alignment. arXiv preprint arXiv:2306.06410 (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"e_1_3_2_1_9_1","volume-title":"Generative adversarial networks: An overview","author":"Creswell Antonia","year":"2018","unstructured":"Antonia Creswell, Tom White, Vincent Dumoulin, Kai Arulkumaran, Biswa Sengupta, and Anil A Bharath. 2018. Generative adversarial networks: An overview. IEEE signal processing magazine 35, 1 (2018), 53--65."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"James W Dias Theresa C Cook and Lawrence D Rosenblum. 2017. The McGurk effect and the primacy of multisensory perception. (2017).","DOI":"10.1093\/acprof:oso\/9780199794607.003.0115"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_2_1_12_1","volume-title":"ACE: A Generative Cross-Modal Retrieval Framework with Coarse-To-Fine Semantic Modeling. arXiv preprint arXiv:2406.17507","author":"Fang Minghui","year":"2024","unstructured":"Minghui Fang, Shengpeng Ji, Jialong Zuo, Hai Huang, Yan Xia, Jieming Zhu, Xize Cheng, Xiaoda Yang, Wenrui Liu, Gang Wang, et al. 2024. ACE: A Generative Cross-Modal Retrieval Framework with Coarse-To-Fine Semantic Modeling. arXiv preprint arXiv:2406.17507 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681347"},{"key":"e_1_3_2_1_14_1","unstructured":"IanJ Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville Yoshua Bengio and Delhi Delhi. [n. d.]. Generative Adversarial Nets. ([n. d.])."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6717"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_2_1_17_1","volume-title":"Av-transpeech: Audio-visual robust speech-to-speech translation. arXiv preprint arXiv:2305.15403","author":"Huang Rongjie","year":"2023","unstructured":"Rongjie Huang, Huadai Liu, Xize Cheng, Yi Ren, Linjun Li, Zhenhui Ye, Jinzheng He, Lichao Zhang, Jinglin Liu, Xiang Yin, et al. 2023. Av-transpeech: Audio-visual robust speech-to-speech translation. arXiv preprint arXiv:2305.15403 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Multimodal human--computer interaction: A survey. Computer vision and image understanding 108, 1--2","author":"Jaimes Alejandro","year":"2007","unstructured":"Alejandro Jaimes and Nicu Sebe. 2007. Multimodal human--computer interaction: A survey. Computer vision and image understanding 108, 1--2 (2007), 116--134."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01150-y"},{"key":"e_1_3_2_1_20_1","volume-title":"Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models. arXiv preprint arXiv:2402.12208","author":"Ji Shengpeng","year":"2024","unstructured":"Shengpeng Ji, Minghui Fang, Ziyue Jiang, Rongjie Huang, Jialung Zuo, Shulei Wang, and Zhou Zhao. 2024. Language-Codec: Reducing the Gaps Between Discrete Codec Representation and Speech Language Models. arXiv preprint arXiv:2402.12208 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612291"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681387"},{"key":"e_1_3_2_1_23_1","volume-title":"Deep video portraits. ACM transactions on graphics (TOG) 37, 4","author":"Kim Hyeongwoo","year":"2018","unstructured":"Hyeongwoo Kim, Pablo Garrido, Ayush Tewari, Weipeng Xu, Justus Thies, Matthias Niessner, Patrick P\u00e9rez, Christian Richardt, Michael Zollh\u00f6fer, and Christian Theobalt. 2018. Deep video portraits. ACM transactions on graphics (TOG) 37, 4 (2018), 1--14."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the 27th ACM international conference on multimedia. 1428--1436","author":"Rudrabha Mukhopadhyay Prajwal KR","year":"2019","unstructured":"Prajwal KR, Rudrabha Mukhopadhyay, Jerin Philip, Abhishek Jha, Vinay Namboodiri, and CV Jawahar. 2019. Towards automatic face-to-face translation. In Proceedings of the 27th ACM international conference on multimedia. 1428--1436."},{"key":"e_1_3_2_1_25_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25 (2012)."},{"key":"e_1_3_2_1_26_1","volume-title":"Alexandre De Brebisson, and Yoshua Bengio","author":"Kumar Rithesh","year":"2017","unstructured":"Rithesh Kumar, Jose Sotelo, Kundan Kumar, Alexandre De Brebisson, and Yoshua Bengio. 2017. Obamanet: Photo-realistic lip-sync from text. arXiv preprint arXiv:1801.01442 (2017)."},{"key":"e_1_3_2_1_27_1","unstructured":"Borong Liang Yan Pan Zhizhi Guo Hang Zhou 'Zhibin Hong Xiaoguang Han Junyu Han Jingtuo Liu Errui Ding and Jingdong Wang. [n. d.]. Expressive Talking Head Generation with Granular Audio-Visual Control. ([n. d.])."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/icassp39728.2021.9414567"},{"volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6707--6717","author":"Misra Ishan","key":"e_1_3_2_1_29_1","unstructured":"Ishan Misra and Laurens van der Maaten. 2020. Self-supervised learning of pretext-invariant representations. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 6707--6717."},{"key":"e_1_3_2_1_30_1","volume-title":"Joon Son Chung, and Andrew Zisserman","author":"Nagrani Arsha","year":"2017","unstructured":"Arsha Nagrani, Joon Son Chung, and Andrew Zisserman. 2017. Voxceleb: a large-scale speaker identification dataset. arXiv preprint arXiv:1706.08612 (2017)."},{"key":"e_1_3_2_1_31_1","unstructured":"Jin Park Minsu Kim Joanna Hong Jeongsoo Choi and YongMan Ro. [n. d.]. SyncTalkFace: Talking Face Generation with Precise Lip-syncing via Audio-Lip Memory. ([n. d.])."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_33_1","series-title":"Lecture Notes in Computer Science,Lecture Notes in Computer Science (Jan","volume-title":"U-Net: Convolutional Networks for Biomedical Image Segmentation","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-Net: Convolutional Networks for Biomedical Image Segmentation. Lecture Notes in Computer Science,Lecture Notes in Computer Science (Jan 2015)."},{"key":"e_1_3_2_1_34_1","volume-title":"Learning audio-visual speech representation by masked multimodal cluster prediction. arXiv preprint arXiv:2201.02184","author":"Shi Bowen","year":"2022","unstructured":"Bowen Shi, Wei-Ning Hsu, Kushal Lakhotia, and Abdelrahman Mohamed. 2022. Learning audio-visual speech representation by masked multimodal cluster prediction. arXiv preprint arXiv:2201.02184 (2022)."},{"key":"e_1_3_2_1_35_1","volume-title":"Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786","author":"Song Yang","year":"2018","unstructured":"Yang Song, Jingwen Zhu, Dawei Li, XiaolongWang, and Hairong Qi. 2018. Talking face generation by conditional recurrent adversarial network. arXiv preprint arXiv:1804.04786 (2018)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073640"},{"key":"e_1_3_2_1_37_1","volume-title":"Resnet in resnet: Generalizing residual architectures. arXiv preprint arXiv:1603.08029","author":"Targ Sasha","year":"2016","unstructured":"Sasha Targ, Diogo Almeida, and Kevin Lyman. 2016. Resnet in resnet: Generalizing residual architectures. arXiv preprint arXiv:1603.08029 (2016)."},{"key":"e_1_3_2_1_38_1","volume-title":"Attention is All you Need. Neural Information Processing Systems,Neural Information Processing Systems (Jun","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, AidanN. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. Neural Information Processing Systems,Neural Information Processing Systems (Jun 2017)."},{"key":"e_1_3_2_1_39_1","volume-title":"Realistic Speech-Driven Facial Animation with GANs","author":"Vougioukas Konstantinos","year":"2019","unstructured":"Konstantinos Vougioukas, Stavros Petridis, and Maja Pantic. 2019. Realistic Speech-Driven Facial Animation with GANs. Cornell University - arXiv, Cornell University - arXiv (Jun 2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"International conference on machine learning. PMLR","author":"Wang Hua","year":"2014","unstructured":"Hua Wang, Feiping Nie, and Heng Huang. 2014. Robust distance metric learning via simultaneous l1-norm minimization and maximization. In International conference on machine learning. PMLR, 1836--1844."},{"key":"e_1_3_2_1_41_1","volume-title":"Predict-and-Update Network: Audio-Visual Speech Recognition Inspired by Human Speech Perception. (Sep","author":"Wang Jiadong","year":"2022","unstructured":"Jiadong Wang, Xinyuan Qian, and Haizhou Li. 2022. Predict-and-Update Network: Audio-Visual Speech Recognition Inspired by Human Speech Perception. (Sep 2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2003.819861"},{"volume-title":"Forty-first International Conference on Machine Learning.","author":"Wang Zehan","key":"e_1_3_2_1_44_1","unstructured":"Zehan Wang, Ziang Zhang, Xize Cheng, Rongjie Huang, Luping Liu, Zhenhui Ye, Haifeng Huang, Yang Zhao, Tao Jin, Peng Gao, et al. [n. d.]. FreeBind: Free Lunch in Unified Multimodal Space via Knowledge Fusion. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_45_1","unstructured":"Zehan Wang Ziang Zhang Xize Cheng Rongjie Huang Luping Liu Zhenhui Ye Haifeng Huang Yang Zhao Tao Jin Peng Gao et al. 2024. Molecule-Space: Free Lunch in Unified Multimodal Space via Knowledge Fusion. arXiv preprint arXiv:2405.04883 (2024)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Zehan Wang Ziang Zhang Hang Zhang Luping Liu Rongjie Huang Xize Cheng Hengshuang Zhao and Zhou Zhao. 2024. OmniBind: Large-scale Omni Multimodal Representation via Binding Spaces. arXiv:2407.11895 [cs.CV] https:\/\/arxiv.org\/abs\/2407.11895","DOI":"10.1109\/TVT.2024.3374516"},{"key":"e_1_3_2_1_47_1","first-page":"22099","article-title":"Connecting multimodal contrastive representations","volume":"36","author":"Wang Zehan","year":"2023","unstructured":"Zehan Wang, Yang Zhao, Haifeng Huang, Jiageng Liu, Aoxiong Yin, Li Tang, Linjun Li, Yongqi Wang, Ziang Zhang, and Zhou Zhao. 2023. Connecting multimodal contrastive representations. Advances in Neural Information Processing Systems 36 (2023), 22099--22114.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.2.270"},{"key":"e_1_3_2_1_49_1","unstructured":"Zunnan Xu Yukang Lin Haonan Han Sicheng Yang Ronghui Li Yachao Zhang and Xiu Li. 2024. MambaTalk: Efficient Holistic Gesture Synthesis with Selective State Space Models. arXiv:2403.09471 [cs.CV] https:\/\/arxiv.org\/abs\/2403.09471"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681264"},{"key":"e_1_3_2_1_51_1","volume-title":"Audio-driven Talking Face Video Generation with Learning-based Personalized Head Pose","author":"Yi Ran","year":"2020","unstructured":"Ran Yi, Zipeng Ye, Jie Zhang, Hujun Bao, and Yong-Jin Liu. 2020. Audio-driven Talking Face Video Generation with Learning-based Personalized Head Pose. Cornell University - arXiv,Cornell University - arXiv (Feb 2020)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/tci.2016.2644865"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019299"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417774"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681386","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681386","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:44Z","timestamp":1750295864000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681386"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":55,"alternative-id":["10.1145\/3664647.3681386","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681386","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}