{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T22:35:21Z","timestamp":1765319721763,"version":"3.46.0"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"36","license":[{"start":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T00:00:00Z","timestamp":1748304000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T00:00:00Z","timestamp":1748304000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Faculty of Information Technology, University of Science, Vietnam National University - Ho Chi Minh City"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-025-20936-6","type":"journal-article","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T05:55:29Z","timestamp":1748325329000},"page":"44949-44972","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["CMATalk: Cross modality alignment for talking head generation"],"prefix":"10.1007","volume":"84","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3614-7982","authenticated-orcid":false,"given":"Xuan-Nam","family":"Cao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5835-861X","authenticated-orcid":false,"given":"Quoc-Huy","family":"Trinh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3046-3041","authenticated-orcid":false,"given":"Minh-Triet","family":"Tran","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,27]]},"reference":[{"key":"20936_CR1","doi-asserted-by":"publisher","unstructured":"Wang K, Wu Q, Song L, Yang Z, Wu W, Qian C, He R, Qiao Y, Loy CC (2020) Mead: A large-scale audio-visual dataset for emotional talking-face generation. In: ECCV. https:\/\/doi.org\/10.1007\/978-3-030-58589-1_42","DOI":"10.1007\/978-3-030-58589-1_42"},{"issue":"4","key":"20936_CR2","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao H, Cooper DG, Keutmann MK, Gur RC, Nenkova A, Verma R (2014) Crema-d: Crowd-sourced emotional multimodal actors dataset. IEEE Transact Affect Comput 5(4):377\u2013390. https:\/\/doi.org\/10.1109\/TAFFC.2014.2336244","journal-title":"IEEE Transact Affect Comput"},{"issue":"11\u201312","key":"20936_CR3","doi-asserted-by":"publisher","first-page":"1767","DOI":"10.1007\/s11263-019-01150-y","volume":"127","author":"A Jamaludin","year":"2019","unstructured":"Jamaludin A, Chung JS, Zisserman A (2019) You said that?: Synthesising talking faces from audio. Int J Comput Vis 127(11\u201312):1767\u20131779. https:\/\/doi.org\/10.1007\/s11263-019-01150-y","journal-title":"Int J Comput Vis"},{"key":"20936_CR4","doi-asserted-by":"publisher","unstructured":"Song Y, Zhu J, Li D, Wang A, Qi H (2019) Talking face generation by conditional recurrent adversarial network. In: Proceedings of the twenty-eighth international joint conference on artificial intelligence, IJCAI-19, pp 919\u2013925. International Joint Conferences on Artificial Intelligence Organization. 
https:\/\/doi.org\/10.24963\/ijcai.2019\/129","DOI":"10.24963\/ijcai.2019\/129"},{"key":"20936_CR5","doi-asserted-by":"crossref","unstructured":"Mittal G, Wang B (2020) Animating face using disentangled audio representations. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 3290\u20133298","DOI":"10.1109\/WACV45572.2020.9093527"},{"key":"20936_CR6","doi-asserted-by":"publisher","unstructured":"Sinha S, Biswas S, Yadav R, Bhowmick B (2022) Emotion-controllable generalized talking face generation. https:\/\/doi.org\/10.48550\/arXiv.2205.01155","DOI":"10.48550\/arXiv.2205.01155"},{"key":"20936_CR7","doi-asserted-by":"publisher","unstructured":"Ji X, Zhou H, Wang K, Wu W, Loy C.C, Cao X, Xu F (2021) Audio-driven emotional video portraits. In: 2021 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 14075\u201314084. https:\/\/doi.org\/10.1109\/CVPR46437.2021.01386","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"20936_CR8","doi-asserted-by":"publisher","unstructured":"Liu X, Xu Y, Wu Q, Zhou H, Wu W, Zhou B (2022) Semantic-aware implicit neural audio-driven video portrait generation. In: European conference on computer vision, pp 106\u2013125. https:\/\/doi.org\/10.1007\/978-3-031-19836-6_7. Springer","DOI":"10.1007\/978-3-031-19836-6_7"},{"key":"20936_CR9","doi-asserted-by":"publisher","unstructured":"Ji X, Zhou H, Wang K, Wu Q, Wu W, Xu F, Cao X (2022) Eamm: One-shot emotional talking face via audio-based emotion-aware motion model. In: ACM SIGGRAPH 2022 conference proceedings, pp 1\u201310. https:\/\/doi.org\/10.1145\/3528233.3530745","DOI":"10.1145\/3528233.3530745"},{"issue":"8","key":"20936_CR10","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"issue":"6","key":"20936_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417774","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou Y, Han X, Shechtman E, Echevarria J, Kalogerakis E, Li D (2020) Makelttalk. ACM Trans Graph 39(6):1\u201315. https:\/\/doi.org\/10.1145\/3414685.3417774","journal-title":"Makelttalk. ACM Trans Graph"},{"key":"20936_CR12","doi-asserted-by":"publisher","unstructured":"Zhou Y, Xu Z, Landreth C, Kalogerakis E, Maji S, Singh K (2018) Visemenet: audio-driven animator-centric speech animation. ACM Trans Graph 37(4). https:\/\/doi.org\/10.1145\/3197517.3201292","DOI":"10.1145\/3197517.3201292"},{"key":"20936_CR13","doi-asserted-by":"publisher","first-page":"585","DOI":"10.1109\/tifs.2022.3146783","volume":"17","author":"L Song","year":"2022","unstructured":"Song L, Wu W, Qian C, He R, Loy CC (2022) Everybody\u2019s talkin\u2019: Let me talk as you want. IEEE Trans Inf Forensic Secur 17:585\u2013598. https:\/\/doi.org\/10.1109\/tifs.2022.3146783","journal-title":"IEEE Trans Inf Forensic Secur"},{"key":"20936_CR14","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01251-8","author":"K Vougioukas","year":"2019","unstructured":"Vougioukas K, Petridis S, Pantic M (2019). Realistic Speech-Driven Facial Animation with GANs. 
https:\/\/doi.org\/10.1007\/s11263-019-01251-8","journal-title":"Realistic Speech-Driven Facial Animation with GANs."},{"key":"20936_CR15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3099900","author":"SE Eskimez","year":"2021","unstructured":"Eskimez SE, Zhang Y, Duan Z (2021). Speech driven talking face generation from a single image and an emotion condition. https:\/\/doi.org\/10.1109\/TMM.2021.3099900","journal-title":"Speech driven talking face generation from a single image and an emotion condition."},{"key":"20936_CR16","doi-asserted-by":"publisher","unstructured":"Gong Y, Chung Y-A, Glass J (2021) Ast: Audio spectrogram transformer. https:\/\/doi.org\/10.48550\/arXiv.2104.01778","DOI":"10.48550\/arXiv.2104.01778"},{"key":"20936_CR17","doi-asserted-by":"publisher","unstructured":"Verma P, Berger J Audio Transformers:Transformer Architectures For Large Scale Audio Understanding. Adieu Convolutions (2021). https:\/\/doi.org\/10.48550\/arXiv.2105.00335","DOI":"10.48550\/arXiv.2105.00335"},{"key":"20936_CR18","doi-asserted-by":"crossref","unstructured":"Zhang J, Liu L, Xue Z, Liu Y (2020) Apb2face: Audio-guided face reenactment with auxiliary pose and blink signals. In: ICASSP 2020-2020 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 4402\u20134406. IEEE","DOI":"10.1109\/ICASSP40776.2020.9052977"},{"key":"20936_CR19","doi-asserted-by":"publisher","unstructured":"Lu Y, Chai J, Cao X (2021) Live speech portraits: real-time photorealistic talking-head animation. ACM Trans Graph 40(6). https:\/\/doi.org\/10.1145\/3478513.3480484","DOI":"10.1145\/3478513.3480484"},{"key":"20936_CR20","doi-asserted-by":"publisher","unstructured":"Yi R, Ye Z, Fan R, Shu Y, Liu Y-J, Lai Y-K, Rosin PL (2022) Animating portrait line drawings from a single face photo and a speech signal. In: ACM SIGGRAPH 2022 conference proceedings. SIGGRAPH \u201922. Association for Computing Machinery. https:\/\/doi.org\/10.1145\/3528233.3530720","DOI":"10.1145\/3528233.3530720"},{"key":"20936_CR21","doi-asserted-by":"crossref","unstructured":"Wang J, Zhao Y, Liu L, Xu T, Li Q, Li S (2023) Emotional talking head generation based on memory-sharing and attention-augmented networks. In: INTERSPEECH 2023, p 2. ISCA,","DOI":"10.21437\/Interspeech.2023-749"},{"key":"20936_CR22","doi-asserted-by":"publisher","unstructured":"Chen Y, Yao Y, Li Z, Wang W, Zhang Y, Yang H, Wen X (2024) Hyperlips: hyper control lips with high resolution decoder for talking face generation. Appl Intell 55(2). https:\/\/doi.org\/10.1007\/s10489-024-05914-z","DOI":"10.1007\/s10489-024-05914-z"},{"key":"20936_CR23","doi-asserted-by":"publisher","unstructured":"Ronneberger O, Fischer P, Brox T (2015) U-Net: Convolutional networks for biomedical image segmentation, pp 234\u2013241. Springer. https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"20936_CR24","doi-asserted-by":"publisher","unstructured":"Cao X-N, Trinh Q-H, Tran M-T (2024) Trans-apl: Transformer model for audio and prior landmark fusion for talking landmark generation. In: ICMV. https:\/\/doi.org\/10.1117\/12.3055071","DOI":"10.1117\/12.3055071"},{"key":"20936_CR25","doi-asserted-by":"crossref","unstructured":"Gulati A, Qin J, Chiu C-C, Parmar N, Zhang Y, Yu J, Han W, Wang S, Zhang Z, Wu Y, Pang R (2020) Conformer: Convolution-augmented Transformer for Speech Recognition. 
arXiv:2005.08100","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"20936_CR26","doi-asserted-by":"crossref","unstructured":"Kong Q, Xu Y, Wang W, Plumbley MD (2020) Sound event detection of weakly labelled data with CNN-transformer and automatic threshold optimization. arXiv:1912.04761","DOI":"10.1109\/TASLP.2020.3014737"},{"key":"20936_CR27","doi-asserted-by":"publisher","unstructured":"Cao X-N, Trinh Q-H, Ho V-S, Tran M-T (2023) Speechsyncnet: Speech to talking landmark via the fusion of prior frame landmark and the audio. In: 2023 IEEE international conference on visual communications and image processing (VCIP), pp 1\u20135. https:\/\/doi.org\/10.1109\/VCIP59821.2023.10402739","DOI":"10.1109\/VCIP59821.2023.10402739"},{"key":"20936_CR28","doi-asserted-by":"publisher","first-page":"221640","DOI":"10.1109\/ACCESS.2020.3043201","volume":"8","author":"MB Er","year":"2020","unstructured":"Er MB (2020) A novel approach for classification of speech emotions based on deep and acoustic features. IEEE Access 8:221640\u2013221653. https:\/\/doi.org\/10.1109\/ACCESS.2020.3043201","journal-title":"IEEE Access"},{"key":"20936_CR29","doi-asserted-by":"publisher","unstructured":"Cao X-N, Trinh Q-H, Do-Nguyen Q-A, Ho V-S, Dang H-T, Tran M-T (2024) Eapc: Emotion and audio prior control framework for the emotional and temporal talking face generation. In: ICAART (2), pp 520\u2013530. https:\/\/doi.org\/10.5220\/0012455700003636","DOI":"10.5220\/0012455700003636"},{"key":"20936_CR30","unstructured":"Kingma DP, Welling M (2022) Auto-encoding variational bayes. arXiv:1312.6114"},{"key":"20936_CR31","doi-asserted-by":"crossref","unstructured":"Joyce JM (2011) Kullback-leibler divergence. In: Lovric, M. (eds) international encyclopedia of statistical science","DOI":"10.1007\/978-3-642-04898-2_327"},{"key":"20936_CR32","doi-asserted-by":"publisher","unstructured":"Lugaresi C, Tang J, Nash H, McClanahan C, Uboweja E, Hays M, Zhang F, Chang C-L, Yong MG, Lee J, Chang W-T, Hua W, Georg M, Grundmann M (2019) MediaPipe: A framework for building perception pipelines. https:\/\/doi.org\/10.48550\/arXiv.1906.08172","DOI":"10.48550\/arXiv.1906.08172"},{"key":"20936_CR33","doi-asserted-by":"publisher","unstructured":"Chen L, Li Z, Maddox R.K, Duan Z, Xu C (2018) Lip movements generation at a glance. In: Computer Vision \u2013 ECCV 2018: 15th European Conference, Munich, Germany, September 8\u201314, 2018, Proceedings, Part VII, pp. 538\u2013553. Springer,. https:\/\/doi.org\/10.1007\/978-3-030-01234-2_32","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"20936_CR34","doi-asserted-by":"publisher","unstructured":"Larkin KG (2015) Structural Similarity Index SSIMplified: Is there really a simpler concept at the heart of image quality measurement?. https:\/\/doi.org\/10.48550\/arXiv.1503.06680","DOI":"10.48550\/arXiv.1503.06680"},{"key":"20936_CR35","doi-asserted-by":"publisher","unstructured":"Heusel M, Ramsauer H, Unterthiner T, Nessler B, Hochreiter S (2017) Gans trained by a two time-scale update rule converge to a local nash equilibrium. In: Proceedings of the 31st international conference on neural information processing systems. NIPS\u201917, pp 6629\u20136640. Curran Associates Inc. https:\/\/doi.org\/10.5555\/3295222.3295408","DOI":"10.5555\/3295222.3295408"},{"key":"20936_CR36","doi-asserted-by":"publisher","unstructured":"Zhong W, Fang C, Cai Y, Wei P, Zhao G, Lin L, Li G (2023) Identity-preserving talking face generation with landmark and appearance priors. 
In: 2023 IEEE\/CVF conference on computer vision and pattern recognition (CVPR), pp 9729\u20139738. https:\/\/doi.org\/10.1109\/CVPR52729.2023.00938","DOI":"10.1109\/CVPR52729.2023.00938"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20936-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-025-20936-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-025-20936-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T22:34:21Z","timestamp":1765319661000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-025-20936-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,27]]},"references-count":36,"journal-issue":{"issue":"36","published-online":{"date-parts":[[2025,11]]}},"alternative-id":["20936"],"URL":"https:\/\/doi.org\/10.1007\/s11042-025-20936-6","relation":{},"ISSN":["1573-7721"],"issn-type":[{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2025,5,27]]},"assertion":[{"value":"30 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 May 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 May 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 May 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
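
The record above is a Crossref REST API "work" message (note the "message-type" and "message" envelope fields). As a minimal sketch of how such a record can be consumed, the Python snippet below fetches the same work by its DOI and prints a formatted citation. The api.crossref.org endpoint is the public Crossref REST API; the variable names and the output format are illustrative assumptions, not part of the record.

```python
import json
import urllib.request

# Crossref serves work records like the one above at its public REST endpoint.
DOI = "10.1007/s11042-025-20936-6"
URL = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(URL) as resp:
    payload = json.load(resp)

# The envelope mirrors the record above; bibliographic fields sit under "message".
assert payload["status"] == "ok" and payload["message-type"] == "work"
work = payload["message"]

# Pull out the fields present in this record (formatting is illustrative).
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work.get("author", []))
title = work["title"][0]
journal = work["container-title"][0]
volume = work.get("volume", "")
issue = work.get("issue", "")
pages = work.get("page", "")
year = work["published"]["date-parts"][0][0]

print(f"{authors} ({year}) {title}. {journal} {volume}({issue}):{pages}. "
      f"https://doi.org/{work['DOI']}")
```

Run against this DOI, the sketch would print a citation assembled entirely from fields in the record, e.g. "Xuan-Nam Cao, Quoc-Huy Trinh, Minh-Triet Tran (2025) CMATalk: Cross modality alignment for talking head generation. Multimedia Tools and Applications 84(36):44949-44972. https://doi.org/10.1007/s11042-025-20936-6".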