{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:23Z","timestamp":1755825023841,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733344","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"1036-1044","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["FREAK: Frequency-modulated High-fidelity and Real-time Audio-driven Talking Portrait Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-5760-5665","authenticated-orcid":false,"given":"Ziqi","family":"Ni","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9933-9240","authenticated-orcid":false,"given":"Ao","family":"Fu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3021-3229","authenticated-orcid":false,"given":"Yi","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Southeast University, Nanjing, China and Key Laboratory of New Generation Artificial Intelligence Technology and Its Interdisciplinary Applications, Ministry of Education, China, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Lele Chen Zhiheng Li Ross K Maddox Zhiyao Duan and Chenliang Xu. 2018. Lip movements generation at a glance. In ECCV. 520--535.","DOI":"10.1007\/978-3-030-01234-2_32"},{"volume-title":"SSD-GAN: Measuring the Realness in the Spatial and Spectral Domains. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:228083505","author":"Chen Yuanqi","key":"e_1_3_2_1_2_1","unstructured":"Yuanqi Chen, Ge Li, Cece Jin, Shan Liu, and Thomas H. Li. 2020. SSD-GAN: Measuring the Realness in the Spatial and Spectral Domains. In AAAI Conference on Artificial Intelligence. https:\/\/api.semanticscholar.org\/CorpusID:228083505"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54427-4_19"},{"key":"e_1_3_2_1_4_1","volume-title":"Rethinking Bottleneck Structure for Efficient Mobile Network Design. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:220363927","author":"Daquan Zhou","year":"2020","unstructured":"Zhou Daquan, Qibin Hou, Yunpeng Chen, Jiashi Feng, and Shuicheng Yan. 2020. Rethinking Bottleneck Structure for Efficient Mobile Network Design. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:220363927"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613753"},{"key":"e_1_3_2_1_7_1","volume-title":"Dual Audio-Centric Modality Coupling for Talking Head Generation. arXiv preprint arXiv:2503.22728","author":"Fu Ao","year":"2025","unstructured":"Ao Fu, Ziqi Ni, and Yi Zhou. 2025. Dual Audio-Centric Modality Coupling for Talking Head Generation. arXiv preprint arXiv:2503.22728 (2025)."},{"key":"e_1_3_2_1_8_1","unstructured":"Ning Gao Xingyu Jiang Xiuhui Zhang and Yue Deng. [n. d.]. Efficient Frequency-Domain Image Deraining with Contrastive Regularization. ( [n. d.])."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3607541.3616812"},{"key":"e_1_3_2_1_10_1","volume-title":"Adaptive fourier neural operators: Efficient token mixers for transformers. arXiv preprint arXiv:2111.13587","author":"Guibas John","year":"2021","unstructured":"John Guibas, Morteza Mardani, Zongyi Li, Andrew Tao, Anima Anandkumar, and Bryan Catanzaro. 2021. Adaptive fourier neural operators: Efficient token mixers for transformers. arXiv preprint arXiv:2111.13587 (2021)."},{"key":"e_1_3_2_1_11_1","volume-title":"Ad-nerf: Audio driven neural radiance fields for talking head synthesis. In ICCV. 5784--5794.","author":"Guo Yudong","year":"2021","unstructured":"Yudong Guo, Keyu Chen, Sen Liang, Yong-Jin Liu, Hujun Bao, and Juyong Zhang. 2021. Ad-nerf: Audio driven neural radiance fields for talking head synthesis. In ICCV. 5784--5794."},{"key":"e_1_3_2_1_12_1","volume-title":"NeurIPS","volume":"30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_1_13_1","volume-title":"Kushal Lakhotia","author":"Hsu Wei-Ning","year":"2021","unstructured":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio, speech, and language processing, Vol. 29 (2021), 3451--3460."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Zhipeng Huang Zhizheng Zhang Cuiling Lan Zheng-Jun Zha Yan Lu and Baining Guo. 2023. Adaptive frequency filters as efficient global token mixers. In ICCV. 6049--6059.","DOI":"10.1109\/ICCV51070.2023.00556"},{"key":"e_1_3_2_1_15_1","volume-title":"Loopy: Taming audio-driven portrait avatar with long-term motion dependency. arXiv preprint arXiv:2409.02634","author":"Jiang Jianwen","year":"2024","unstructured":"Jianwen Jiang, Chao Liang, Jiaqi Yang, Gaojie Lin, Tianyun Zhong, and Yanbo Zheng. 2024. Loopy: Taming audio-driven portrait avatar with long-term motion dependency. arXiv preprint arXiv:2409.02634 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Liming Jiang Bo Dai Wayne Wu and Chen Change Loy. 2021. Focal frequency loss for image reconstruction and synthesis. In ICCV. 13919--13929.","DOI":"10.1109\/ICCV48922.2021.01366"},{"volume-title":"Perceptual losses for real-time style transfer and super-resolution","author":"Johnson Justin","key":"e_1_3_2_1_17_1","unstructured":"Justin Johnson, Alexandre Alahi, and Li Fei-Fei. 2016. Perceptual losses for real-time style transfer and super-resolution. In ECCV. Springer, 694--711."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00381"},{"key":"e_1_3_2_1_19_1","unstructured":"Jiahe Li Jiawei Zhang Xiao Bai Jun Zhou and Lin Gu. 2023. Efficient region-aware neural radiance fields for high-fidelity talking portrait synthesis. In ICCV. 7568--7578."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Shiqi Lin Zhizheng Zhang Zhipeng Huang Yan Lu Cuiling Lan Peng Chu Quanzeng You Jiang Wang Zicheng Liu Amey Parulkar et al. 2023. Deep frequency filtering for domain generalization. In CVPR. 11797--11807.","DOI":"10.1109\/CVPR52729.2023.01135"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480484"},{"key":"e_1_3_2_1_22_1","volume-title":"Styletalk: One-shot talking head generation with controllable speaking styles. arXiv preprint arXiv:2301.01081","author":"Ma Yifeng","year":"2023","unstructured":"Yifeng Ma, Suzhen Wang, Zhipeng Hu, Changjie Fan, Tangjie Lv, Yu Ding, Zhidong Deng, and Xin Yu. 2023. Styletalk: One-shot talking head generation with controllable speaking styles. arXiv preprint arXiv:2301.01081 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Shenoy Pratik Gurudatt","author":"Masi Iacopo","year":"2020","unstructured":"Iacopo Masi, Aditya Killekar, Royston Marian Mascarenhas, Shenoy Pratik Gurudatt, and Wael AbdAlmageed. 2020. Two-branch Recurrent Network for Isolating Deepfakes in Videos. ArXiv, Vol. abs\/2008.03412 (2020). https:\/\/api.semanticscholar.org\/CorpusID:221090663"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00521"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/556016"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"volume-title":"On the Spectral Bias of Neural Networks. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:53012119","author":"Rahaman Nasim","key":"e_1_3_2_1_28_1","unstructured":"Nasim Rahaman, Aristide Baratin, Devansh Arpit, Felix Dr\u00e4xler, Min Lin, Fred A. Hamprecht, Yoshua Bengio, and Aaron C. Courville. 2018. On the Spectral Bias of Neural Networks. In International Conference on Machine Learning. https:\/\/api.semanticscholar.org\/CorpusID:53012119"},{"key":"e_1_3_2_1_29_1","first-page":"980","article-title":"Global filter networks for image classification","volume":"34","author":"Rao Yongming","year":"2021","unstructured":"Yongming Rao, Wenliang Zhao, Zheng Zhu, Jiwen Lu, and Jie Zhou. 2021. Global filter networks for image classification. NeurIPS, Vol. 34 (2021), 980--993.","journal-title":"NeurIPS"},{"volume-title":"Learning dynamic facial radiance fields for few-shot talking head synthesis","author":"Shen Shuai","key":"e_1_3_2_1_30_1","unstructured":"Shuai Shen, Wanhua Li, Zheng Zhu, Yueqi Duan, Jie Zhou, and Jiwen Lu. 2022. Learning dynamic facial radiance fields for few-shot talking head synthesis. In ECCV. Springer, 666--682."},{"key":"e_1_3_2_1_31_1","volume-title":"Difftalk: Crafting diffusion models for generalized talking head synthesis. arXiv preprint arXiv:2301.03786","author":"Shen Shuai","year":"2023","unstructured":"Shuai Shen, Wenliang Zhao, Zibin Meng, Wanhua Li, Zheng Zhu, Jie Zhou, and Jiwen Lu. 2023. Difftalk: Crafting diffusion models for generalized talking head synthesis. arXiv preprint arXiv:2301.03786 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Very Deep Convolutional Networks for Large-Scale Image Recognition. CoRR","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very Deep Convolutional Networks for Large-Scale Image Recognition. CoRR, Vol. abs\/1409.1556 (2014). https:\/\/api.semanticscholar.org\/CorpusID:14124313"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2022.3146783"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555393"},{"key":"e_1_3_2_1_36_1","volume-title":"Frequency-Aware Deepfake Detection: Improving Generalizability through Frequency Space Learning. ArXiv","author":"Tan Chuangchuang","year":"2024","unstructured":"Chuangchuang Tan, Yao Zhao, Shikui Wei, Guanghua Gu, Ping Liu, and Yunchao Wei. 2024. Frequency-Aware Deepfake Detection: Improving Generalizability through Frequency Space Learning. ArXiv, Vol. abs\/2403.07240 (2024). https:\/\/api.semanticscholar.org\/CorpusID:268890333"},{"key":"e_1_3_2_1_37_1","volume-title":"Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368","author":"Tang Jiaxiang","year":"2022","unstructured":"Jiaxiang Tang, Kaisiyuan Wang, Hang Zhou, Xiaokang Chen, Dongliang He, Tianshu Hu, Jingtuo Liu, Gang Zeng, and Jingdong Wang. 2022. Real-time neural radiance talking portrait synthesis via audio-spatial decomposition. arXiv preprint arXiv:2211.12368 (2022)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i14.29457"},{"key":"e_1_3_2_1_39_1","volume-title":"EMO: Emote Portrait Alive Generating Expressive Portrait Videos with Audio2Video Diffusion Model Under Weak Conditions","author":"Tian Linrui","year":"2025","unstructured":"Linrui Tian, Qi Wang, Bang Zhang, and Liefeng Bo. 2025. EMO: Emote Portrait Alive Generating Expressive Portrait Videos with Audio2Video Diffusion Model Under Weak Conditions. In ECCV. Springer, 244--260."},{"key":"e_1_3_2_1_40_1","volume-title":"FVD: A new Metric for Video Generation. In DGS@ICLR. https:\/\/api.semanticscholar.org\/CorpusID:198489709","author":"Unterthiner Thomas","year":"2019","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Rapha\u00ebl Marinier, Marcin Michalski, and Sylvain Gelly. 2019. FVD: A new Metric for Video Generation. In DGS@ICLR. https:\/\/api.semanticscholar.org\/CorpusID:198489709"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Fei Wang Dan Guo Kun Li Zhun Zhong and Meng Wang. 2024. Frequency decoupling for motion magnification via multi-level isomorphic architecture. In CVPR. 18984--18994.","DOI":"10.1109\/CVPR52733.2024.01796"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Jiadong Wang Xinyuan Qian Malu Zhang Robby T Tan and Haizhou Li. 2023. Seeing What You Said: Talking Face Generation Guided by a Lip Reading Expert. In CVPR. 14653--14662.","DOI":"10.1109\/CVPR52729.2023.01408"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531415"},{"key":"e_1_3_2_1_44_1","volume-title":"Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694","author":"Wei Huawei","year":"2024","unstructured":"Huawei Wei, Zejun Yang, and Zhisheng Wang. 2024. Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694 (2024)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681108"},{"key":"e_1_3_2_1_46_1","volume-title":"Hallo: Hierarchical audio-driven visual synthesis for portrait image animation. arXiv preprint arXiv:2406.08801","author":"Xu Mingwang","year":"2024","unstructured":"Mingwang Xu, Hui Li, Qingkun Su, Hanlin Shang, Liwei Zhang, Ce Liu, Jingdong Wang, Yao Yao, and Siyu Zhu. 2024. Hallo: Hierarchical audio-driven visual synthesis for portrait image animation. arXiv preprint arXiv:2406.08801 (2024)."},{"key":"e_1_3_2_1_47_1","unstructured":"Dogucan Yaman Fevziye Irem Eyiokur Leonard B\u00e4rmann Hazim Kemal Ekenel and Alexander Waibel. [n. d.]. Audio-driven Talking Face Generation with Stabilized Synchronization Loss. ( [n. d.])."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547781"},{"key":"e_1_3_2_1_49_1","unstructured":"Jiahui Yu Zhe Lin Jimei Yang Xiaohui Shen Xin Lu and Thomas S Huang. 2019. Free-form image inpainting with gated convolution. In ICCV. 4471--4480."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Wenxuan Zhang Xiaodong Cun Xuan Wang Yong Zhang Xi Shen Yu Guo Ying Shan and Fei Wang. 2023a. SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation. In CVPR. 8652--8661.","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_2_1_51_1","volume-title":"Davd-net: Deep audio-aided video decompression of talking heads. In CVPR. 12335--12344.","author":"Zhang Xi","year":"2020","unstructured":"Xi Zhang, Xiaolin Wu, Xinliang Zhai, Xianye Ben, and Chengjie Tu. 2020. Davd-net: Deep audio-aided video decompression of talking heads. In CVPR. 12335--12344."},{"key":"e_1_3_2_1_52_1","volume-title":"MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting. arXiv preprint arXiv:2410.10122","author":"Zhang Yue","year":"2024","unstructured":"Yue Zhang, Minhao Liu, Zhaokang Chen, Bin Wu, Yubin Zeng, Chao Zhan, Yingjie He, Junxin Huang, and Wenjiang Zhou. 2024. MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting. arXiv preprint arXiv:2410.10122 (2024)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25464"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Weizhi Zhong Chaowei Fang Yinqi Cai Pengxu Wei Gangming Zhao Liang Lin and Guanbin Li. 2023. Identity-Preserving Talking Face Generation with Landmark and Appearance Priors. In CVPR. 9729--9738.","DOI":"10.1109\/CVPR52729.2023.00938"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG), Vol. 39, 6 (2020), 1--15.","journal-title":"ACM Transactions On Graphics (TOG)"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733344","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:13:16Z","timestamp":1755749596000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733344"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":55,"alternative-id":["10.1145\/3731715.3733344","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733344","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}