{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T19:08:58Z","timestamp":1762110538306,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":83,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729126"},{"type":"electronic","value":"9783031729133"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72913-3_27","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:46:33Z","timestamp":1733089593000},"page":"484-503","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["DIM: Dyadic Interaction Modeling for\u00a0Social Behavior Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2391-3563","authenticated-orcid":false,"given":"Minh","family":"Tran","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0281-8896","authenticated-orcid":false,"given":"Di","family":"Chang","sequence":"additional","affiliation":[]},{"given":"Maksim","family":"Siniukov","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5873-1434","authenticated-orcid":false,"given":"Mohammad","family":"Soleymani","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"27_CR1","doi-asserted-by":"crossref","unstructured":"Ahuja, C., Ma, S., Morency, L.P., Sheikh, Y.: To react or not to react: end-to-end visual pose forecasting for personalized avatar during dyadic conversations. In: 2019 International Conference on Multimodal Interaction, pp. 74\u201384 (2019)","DOI":"10.1145\/3340555.3353725"},{"key":"27_CR2","doi-asserted-by":"publisher","unstructured":"Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.): Computer Vision \u2013 ECCV 2022. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19769-7","DOI":"10.1007\/978-3-031-19769-7"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Bohus, D., Horvitz, E.: Facilitating multiparty dialog with gaze, gesture, and speech. In: International Conference on Multimodal Interfaces and the Workshop on Machine Learning for Multimodal Interaction, pp.\u00a01\u20138 (2010)","DOI":"10.1145\/1891903.1891910"},{"key":"27_CR4","doi-asserted-by":"crossref","unstructured":"Cao, C., Wu, H., Weng, Y., Shao, T., Zhou, K.: Real-time facial animation with image-based dynamic avatars. ACM Trans. Graph. 35(4) (2016)","DOI":"10.1145\/2897824.2925873"},{"issue":"4","key":"27_CR5","doi-asserted-by":"publisher","first-page":"1283","DOI":"10.1145\/1095878.1095881","volume":"24","author":"Y Cao","year":"2005","unstructured":"Cao, Y., Tien, W.C., Faloutsos, P., Pighin, F.: Expressive speech-driven facial animation. ACM Trans. Graph. 24(4), 1283\u20131302 (2005)","journal-title":"ACM Trans. Graph."},{"issue":"3","key":"27_CR6","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1109\/TAFFC.2016.2545650","volume":"8","author":"A Cerekovic","year":"2016","unstructured":"Cerekovic, A., Aran, O., Gatica-Perez, D.: Rapport with virtual agents: what do human social cues and personality explain? IEEE Trans. Affect. Comput. 8(3), 382\u2013395 (2016)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Chang, Z., Hu, W., Yang, Q., Zheng, S.: Hierarchical semantic perceptual listener head video generation: a high-performance pipeline. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 9581\u20139585 (2023)","DOI":"10.1145\/3581783.3612869"},{"key":"27_CR8","doi-asserted-by":"crossref","unstructured":"Chen, L., Cao, C., De\u00a0la Torre, F., Saragih, J., Xu, C., Sheikh, Y.: High-fidelity face tracking for ar\/vr via deep lighting adaptation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13059\u201313069 (2021)","DOI":"10.1109\/CVPR46437.2021.01286"},{"key":"27_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-3-030-58545-7_3","volume-title":"Computer Vision \u2013 ECCV 2020","author":"L Chen","year":"2020","unstructured":"Chen, L., et al.: Talking-head generation with rhythmic head motion. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 35\u201351. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_3"},{"key":"27_CR10","doi-asserted-by":"crossref","unstructured":"Chen, L., Li, Z., Maddox, R.K., Duan, Z., Xu, C.: Lip movements generation at a glance. In: Proceedings of the European Conference on Computer Vision, pp. 520\u2013535 (2018)","DOI":"10.1007\/978-3-030-01234-2_32"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Chen, L., Maddox, R.K., Duan, Z., Xu, C.: Hierarchical cross-modal talking face generation with dynamic pixel-wise loss. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7832\u20137841 (2019)","DOI":"10.1109\/CVPR.2019.00802"},{"key":"27_CR12","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Chu, H., Li, D., Fidler, S.: A face-to-face neural conversation model. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7113\u20137121 (2018)","DOI":"10.1109\/CVPR.2018.00743"},{"key":"27_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"251","DOI":"10.1007\/978-3-319-54427-4_19","volume-title":"Computer Vision \u2013 ACCV 2016 Workshops","author":"JS Chung","year":"2017","unstructured":"Chung, J.S., Zisserman, A.: Out of time: automated lip sync in the wild. In: Chen, C.-S., Lu, J., Ma, K.-K. (eds.) ACCV 2016. LNCS, vol. 10117, pp. 251\u2013263. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-54427-4_19"},{"key":"27_CR15","doi-asserted-by":"crossref","unstructured":"Cudeiro, D., Bolkart, T., Laidlaw, C., Ranjan, A., Black, M.J.: Capture, learning, and synthesis of 3d speaking styles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 10101\u201310111 (2019)","DOI":"10.1109\/CVPR.2019.01034"},{"key":"27_CR16","doi-asserted-by":"crossref","unstructured":"Danecek, R., Black, M.J., Bolkart, T.: EMOCA: emotion driven monocular face capture and animation. In: Conference on Computer Vision and Pattern Recognition (CVPR), pp. 20311\u201320322 (2022)","DOI":"10.1109\/CVPR52688.2022.01967"},{"key":"27_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1007\/978-3-030-58577-8_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"D Das","year":"2020","unstructured":"Das, D., Biswas, S., Sinha, S., Bhowmick, B.: Speech-driven facial animation using cascaded GANs for learning of motion and texture. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 408\u2013424. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_25"},{"key":"27_CR18","volume-title":"Interpersonal Communication Book, The, 13\/E","author":"JA DeVito","year":"2013","unstructured":"DeVito, J.A.: Interpersonal Communication Book, The, 13\/E. Pearson, London (2013)"},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"Edwards, P., Landreth, C., Fiume, E., Singh, K.: Jali: an animator-centric viseme model for expressive lip synchronization. ACM Trans. Graph. 35(4), 1\u201311 (2016)","DOI":"10.1145\/2897824.2925984"},{"key":"27_CR20","doi-asserted-by":"publisher","first-page":"3480","DOI":"10.1109\/TMM.2021.3099900","volume":"24","author":"SE Eskimez","year":"2021","unstructured":"Eskimez, S.E., Zhang, Y., Duan, Z.: Speech driven talking face generation from a single image and an emotion condition. IEEE Trans. Multimedia 24, 3480\u20133490 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"27_CR21","doi-asserted-by":"crossref","unstructured":"Fan, B., Wang, L., Soong, F.K., Xie, L.: Photo-real talking head with deep bidirectional lstm. In: IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 4884\u20134888. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178899"},{"key":"27_CR22","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lin, Z., Saito, J., Wang, W., Komura, T.: Faceformer: speech-driven 3d facial animation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18770\u201318780 (2022)","DOI":"10.1109\/CVPR52688.2022.01821"},{"issue":"6","key":"27_CR23","doi-asserted-by":"publisher","first-page":"591","DOI":"10.1109\/TMM.2010.2052239","volume":"12","author":"G Fanelli","year":"2010","unstructured":"Fanelli, G., Gall, J., Romsdorfer, H., Weise, T., Van Gool, L.: A 3-d audio-visual corpus of affective communication. IEEE Trans. Multimedia 12(6), 591\u2013598 (2010)","journal-title":"IEEE Trans. Multimedia"},{"key":"27_CR24","doi-asserted-by":"publisher","unstructured":"Feng, Y., Feng, H., Black, M.J., Bolkart, T.: Learning an animatable detailed 3D face model from in-the-wild images. ACM Trans. Graph. 40(8) (2021). https:\/\/doi.org\/10.1145\/3450626.3459936","DOI":"10.1145\/3450626.3459936"},{"issue":"4","key":"27_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3306346.3323028","volume":"38","author":"O Fried","year":"2019","unstructured":"Fried, O., et al.: Text-based editing of talking-head video. ACM Trans. Graph. 38(4), 1\u201314 (2019)","journal-title":"ACM Trans. Graph."},{"key":"27_CR26","unstructured":"Geng, S., Teotia, R., Tendulkar, P., Menon, S., Vondrick, C.: Affective faces for goal-driven dyadic communication. arXiv preprint arXiv:2301.10939 (2023)"},{"key":"27_CR27","unstructured":"Gong, Y., et al.: Contrastive audio-visual masked autoencoder. arXiv preprint arXiv:2210.07839 (2022)"},{"key":"27_CR28","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1007\/978-3-540-74997-4_12","volume-title":"Intelligent Virtual Agents","author":"J Gratch","year":"2007","unstructured":"Gratch, J., Wang, N., Gerten, J., Fast, E., Duffy, R.: Creating rapport with virtual agents. In: Pelachaud, C., Martin, J.-C., Andr\u00e9, E., Chollet, G., Karpouzis, K., Pel\u00e9, D. (eds.) IVA 2007. LNCS (LNAI), vol. 4722, pp. 125\u2013138. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-74997-4_12"},{"key":"27_CR29","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"160","DOI":"10.1007\/978-3-319-67401-8_18","volume-title":"Intelligent Virtual Agents","author":"D Greenwood","year":"2017","unstructured":"Greenwood, D., Laycock, S., Matthews, I.: Predicting head pose in dyadic conversation. In: IVA 2017. LNCS (LNAI), vol. 10498, pp. 160\u2013169. Springer, Cham (2017). https:\/\/doi.org\/10.1007\/978-3-319-67401-8_18"},{"key":"27_CR30","doi-asserted-by":"crossref","unstructured":"He, Y., et al.: Forgerynet: a versatile benchmark for comprehensive forgery analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4360\u20134369 (2021)","DOI":"10.1109\/CVPR46437.2021.00434"},{"key":"27_CR31","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"27_CR32","doi-asserted-by":"crossref","unstructured":"Hussen\u00a0Abdelaziz, A., Theobald, B.J., Dixon, P., Knothe, R., Apostoloff, N., Kajareker, S.: Modality dropout for improved performance-driven talking faces. In: Proceedings of the 2020 International Conference on Multimodal Interaction, pp. 378\u2013386 (2020)","DOI":"10.1145\/3382507.3418840"},{"key":"27_CR33","doi-asserted-by":"crossref","unstructured":"Ji, X., et al.: Audio-driven emotional video portraits. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 14080\u201314089 (2021)","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"27_CR34","unstructured":"Jonell, P., Kucherenko, T., Ekstedt, E., Beskow, J.: Learning non-verbal behavior for a social robot from youtube videos. In: ICDL-EpiRob Workshop on Naturalistic Non-Verbal and Affective Human-Robot Interactions, Oslo, Norway, 19 August 2019 (2019)"},{"key":"27_CR35","doi-asserted-by":"crossref","unstructured":"Jonell, P., Kucherenko, T., Henter, G.E., Beskow, J.: Let\u2019s face it: probabilistic multi-modal interlocutor-aware generation of facial gestures in dyadic settings. In: Proceedings of the 20th ACM International Conference on Intelligent Virtual Agents, pp.\u00a01\u20138 (2020)","DOI":"10.1145\/3383652.3423911"},{"issue":"4","key":"27_CR36","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. 36(4), 1\u201312 (2017)","journal-title":"ACM Trans. Graph."},{"issue":"4","key":"27_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3197517.3201283","volume":"37","author":"H Kim","year":"2018","unstructured":"Kim, H., et al.: Deep video portraits. ACM Trans. Graph. 37(4), 1\u201314 (2018)","journal-title":"ACM Trans. Graph."},{"key":"27_CR38","doi-asserted-by":"crossref","unstructured":"Kucherenko, T., et al.: The genea challenge 2023: a large-scale evaluation of gesture generation models in monadic and dyadic settings. In: Proceedings of the 25th International Conference on Multimodal Interaction, pp. 792\u2013801 (2023)","DOI":"10.1145\/3577190.3616120"},{"key":"27_CR39","doi-asserted-by":"crossref","unstructured":"Lahiri, A., Kwatra, V., Frueh, C., Lewis, J., Bregler, C.: Lipsync3d: data-efficient learning of personalized 3d talking faces from video using pose and lighting normalization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2755\u20132764 (2021)","DOI":"10.1109\/CVPR46437.2021.00278"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Li, H., Yu, J., Ye, Y., Bregler, C.: Realtime facial animation with on-the-fly correctives. ACM Trans. Graph. 32(4), 42-1 (2013)","DOI":"10.1145\/2461912.2462019"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Li, R., Yang, S., Ross, D.A., Kanazawa, A.: Ai choreographer: music conditioned 3d dance generation with aist++. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13401\u201313412 (2021)","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"27_CR42","unstructured":"Liu, X., et al.: Audio-driven co-speech gesture video generation. arXiv preprint arXiv:2212.02350 (2022)"},{"issue":"6","key":"27_CR43","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2816795.2818130","volume":"34","author":"Y Liu","year":"2015","unstructured":"Liu, Y., Xu, F., Chai, J., Tong, X., Wang, L., Huo, Q.: Video-audio driven real-time facial animation. ACM Trans. Graph. 34(6), 1\u201310 (2015)","journal-title":"ACM Trans. Graph."},{"key":"27_CR44","doi-asserted-by":"crossref","unstructured":"Massaro, D., Cohen, M., Tabain, M., Beskow, J., Clark, R.: Animated speech: research progress and applications. In: Audiovisual Speech Processing, pp. 309\u2013345 (2012)","DOI":"10.1017\/CBO9780511843891.014"},{"key":"27_CR45","doi-asserted-by":"crossref","unstructured":"Mirsamadi, S., Barsoum, E., Zhang, C.: Automatic speech emotion recognition using recurrent neural networks with local attention. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 2227\u20132231. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952552"},{"key":"27_CR46","unstructured":"Ng, E.: Learning2listen. https:\/\/evonneng.github.io\/learning2listen\/"},{"key":"27_CR47","doi-asserted-by":"crossref","unstructured":"Ng, E., et al.: Learning to listen: modeling non-deterministic dyadic facial motion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20395\u201320405 (2022)","DOI":"10.1109\/CVPR52688.2022.01975"},{"key":"27_CR48","doi-asserted-by":"crossref","unstructured":"Ng, E., Subramanian, S., Klein, D., Kanazawa, A., Darrell, T., Ginosar, S.: Can language models learn to listen? In: Proceedings of the International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00925"},{"key":"27_CR49","unstructured":"Nojavanasghari, B., Huang, Y., Khan, S.: Interactive generative adversarial networks for facial expression generation in dyadic interactions. arXiv preprint arXiv:1801.09092 (2018)"},{"key":"27_CR50","unstructured":"van\u00a0den Oord, A., Vinyals, O., Kavukcuoglu, K.: Neural discrete representation learning. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp. 6309\u20136318 (2017)"},{"key":"27_CR51","unstructured":"Palmero, C., et\u00a0al.: Chalearn lap challenges on self-reported personality recognition and non-verbal behavior forecasting during social dyadic interactions: dataset, design, and results. In: Understanding Social Behavior in Dyadic and Small Group Interactions, pp. 4\u201352. PMLR (2022)"},{"key":"27_CR52","doi-asserted-by":"crossref","unstructured":"Pham, H.X., Wang, Y., Pavlovic, V.: End-to-end learning for 3d facial animation from speech. In: Proceedings of the ACM International Conference on Multimodal Interaction, pp. 361\u2013365 (2018)","DOI":"10.1145\/3242969.3243017"},{"key":"27_CR53","doi-asserted-by":"crossref","unstructured":"Prajwal, K., Mukhopadhyay, R., Namboodiri, V.P., Jawahar, C.: A lip sync expert is all you need for speech to lip generation in the wild. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 484\u2013492 (2020)","DOI":"10.1145\/3394171.3413532"},{"key":"27_CR54","doi-asserted-by":"crossref","unstructured":"Reece, A., et al.: The candor corpus: insights from a large multimodal dataset of naturalistic conversation. Sci. Adv. 9(13), eadf3197 (2023)","DOI":"10.1126\/sciadv.adf3197"},{"key":"27_CR55","doi-asserted-by":"crossref","unstructured":"Ren, Y., Li, G., Chen, Y., Li, T.H., Liu, S.: Pirenderer: controllable portrait image generation via semantic neural rendering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13759\u201313768 (2021)","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"27_CR56","doi-asserted-by":"crossref","unstructured":"Richard, A., Zollh\u00f6fer, M., Wen, Y., de\u00a0la Torre, F., Sheikh, Y.: Meshtalk: 3d face animation from speech using cross-modality disentanglement. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1173\u20131182 (2021)","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"27_CR57","unstructured":"R\u00f6ssler, A., Cozzolino, D., Verdoliva, L., Riess, C., Thies, J., Nie\u00dfner, M.: Faceforensics: a large-scale video dataset for forgery detection in human faces. arXiv preprint arXiv:1803.09179 (2018)"},{"key":"27_CR58","doi-asserted-by":"publisher","unstructured":"Song, L., et al.: Adaptive face forgery detection in cross domain. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. Lecture Notes in Computer Science, pp. 467\u2013484. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19830-4_27","DOI":"10.1007\/978-3-031-19830-4_27"},{"key":"27_CR59","doi-asserted-by":"crossref","unstructured":"Song, L., Li, X., Fang, Z., Jin, Z., Chen, Y., Xu, C.: Face forgery detection via symmetric transformer. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4102\u20134111 (2022)","DOI":"10.1145\/3503161.3547806"},{"key":"27_CR60","doi-asserted-by":"crossref","unstructured":"Song, L., Liu, B., Yin, G., Dong, X., Zhang, Y., Bai, J.X.: Tacr-net: editing on deep video and voice portraits. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 478\u2013486 (2021)","DOI":"10.1145\/3474085.3475196"},{"key":"27_CR61","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1007\/978-3-030-87361-5_61","volume-title":"Image and Graphics","author":"L Song","year":"2021","unstructured":"Song, L., Liu, B., Yu, N.: Talking face video generation with\u00a0editable expression. In: Peng, Y., Hu, S.-M., Gabbouj, M., Zhou, K., Elad, M., Xu, K. (eds.) ICIG 2021. LNCS, vol. 12890, pp. 753\u2013764. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-87361-5_61"},{"key":"27_CR62","doi-asserted-by":"crossref","unstructured":"Song, L., Yin, G., Jin, Z., Dong, X., Xu, C.: Emotional listener portrait: realistic listener motion simulation in conversation. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 20782\u201320792. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.01905"},{"key":"27_CR63","doi-asserted-by":"crossref","unstructured":"Song, L., Yin, G., Liu, B., Zhang, Y., Yu, N.: Fsft-net: face transfer video generation with few-shot views. In: 2021 IEEE International Conference on Image Processing (ICIP), pp. 3582\u20133586. IEEE (2021)","DOI":"10.1109\/ICIP42928.2021.9506512"},{"key":"27_CR64","doi-asserted-by":"crossref","unstructured":"Song, S., et\u00a0al.: React2023: the first multiple appropriate facial reaction generation challenge. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 9620\u20139624 (2023)","DOI":"10.1145\/3581783.3612832"},{"key":"27_CR65","doi-asserted-by":"crossref","unstructured":"Stan, S., Haque, K.I., Yumak, Z.: Facediffuser: speech-driven 3d facial animation synthesis using diffusion. In: Proceedings of the 16th ACM SIGGRAPH Conference on Motion, Interaction and Games, pp. 1\u201311 (2023)","DOI":"10.1145\/3623264.3624447"},{"issue":"4","key":"27_CR66","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073640","volume":"36","author":"S Suwajanakorn","year":"2017","unstructured":"Suwajanakorn, S., Seitz, S.M., Kemelmacher-Shlizerman, I.: Synthesizing obama: learning lip sync from audio. ACM Trans. Graph. 36(4), 1\u201313 (2017)","journal-title":"ACM Trans. Graph."},{"issue":"4","key":"27_CR67","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073699","volume":"36","author":"S Taylor","year":"2017","unstructured":"Taylor, S., et al.: A deep learning approach for generalized speech animation. ACM Trans. Graph. 36(4), 1\u201311 (2017)","journal-title":"ACM Trans. Graph."},{"key":"27_CR68","unstructured":", Taylor, S.L., Mahler, M., Theobald, B.J., Matthews, I.: Dynamic units of visual speech. In: Proceedings of the ACM SIGGRAPH\/Eurographics Conference on Computer Animation, pp. 275\u2013284 (2012)"},{"key":"27_CR69","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"716","DOI":"10.1007\/978-3-030-58517-4_42","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Thies","year":"2020","unstructured":"Thies, J., Elgharib, M., Tewari, A., Theobalt, C., Nie\u00dfner, M.: Neural voice puppetry: audio-driven facial reenactment. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12361, pp. 716\u2013731. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58517-4_42"},{"issue":"5","key":"27_CR70","doi-asserted-by":"publisher","first-page":"1398","DOI":"10.1007\/s11263-019-01251-8","volume":"128","author":"K Vougioukas","year":"2020","unstructured":"Vougioukas, K., Petridis, S., Pantic, M.: Realistic speech-driven facial animation with gans. Int. J. Comput. Vision 128(5), 1398\u20131413 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"27_CR71","unstructured":"Wang, T.C., et al.: Video-to-video synthesis. arXiv preprint arXiv:1808.06601 (2018)"},{"issue":"4","key":"27_CR72","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2010324.1964972","volume":"30","author":"T Weise","year":"2011","unstructured":"Weise, T., Bouaziz, S., Li, H., Pauly, M.: Realtime performance-based facial animation. ACM Trans. Graph. 30(4), 1\u201310 (2011)","journal-title":"ACM Trans. Graph."},{"key":"27_CR73","unstructured":"Woo, J., Fares, M., Pelachaud, C., Achard, C.: Amii: adaptive multimodal inter-personal and intra-personal model for adapted behavior synthesis. arXiv preprint arXiv:2305.11310 (2023)"},{"key":"27_CR74","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Cun, X., Wang, J., Wong, T.T.: Codetalker: speech-driven 3d facial animation with discrete motion prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12780\u201312790 (2023)","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"27_CR75","doi-asserted-by":"crossref","unstructured":"Xu, Y., Feng, A.W., Marsella, S., Shapiro, A.: A practical and configurable lip sync method for games. In: Proceedings of Motion on Games, pp. 131\u2013140 (2013)","DOI":"10.1145\/2522628.2522904"},{"key":"27_CR76","unstructured":"Yi, R., Ye, Z., Zhang, J., Bao, H., Liu, Y.J.: Audio-driven talking face video generation with learning-based personalized head pose. arXiv preprint arXiv:2002.10137 (2020)"},{"key":"27_CR77","doi-asserted-by":"crossref","unstructured":"Yu, J., Chen, C.W.: From talking head to singing head: a significant enhancement for more natural human computer interaction. In: 2017 IEEE International Conference on Multimedia and Expo (ICME), pp. 511\u2013516. IEEE (2017)","DOI":"10.1109\/ICME.2017.8019362"},{"key":"27_CR78","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Li, L., Ding, Y., Fan, C.: Flow-guided one-shot talking face generation with a high-resolution audio-visual dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3661\u20133670 (2021)","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"27_CR79","doi-asserted-by":"crossref","unstructured":"Zhou, H., Sun, Y., Wu, W., Loy, C.C., Wang, X., Liu, Z.: Pose-controllable talking face generation by implicitly modularized audio-visual representation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4176\u20134186 (2021)","DOI":"10.1109\/CVPR46437.2021.00416"},{"key":"27_CR80","doi-asserted-by":"publisher","unstructured":"Zhou, M., Bai, Y., Zhang, W., Yao, T., Zhao, T., Mei, T.: Responsive listening head generation: a benchmark dataset and baseline. In: European Conference on Computer Vision, pp. 124\u2013142. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19839-7_8","DOI":"10.1007\/978-3-031-19839-7_8"},{"issue":"6","key":"27_CR81","first-page":"1","volume":"39","author":"Y Zhou","year":"2020","unstructured":"Zhou, Y., Han, X., Shechtman, E., Echevarria, J., Kalogerakis, E., Li, D.: Makelttalk: speaker-aware talking-head animation. ACM Trans. Graph. (TOG) 39(6), 1\u201315 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"4","key":"27_CR82","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3197517.3201292","volume":"37","author":"Y Zhou","year":"2018","unstructured":"Zhou, Y., Xu, Z., Landreth, C., Kalogerakis, E., Maji, S., Singh, K.: Visemenet: audio-driven animator-centric speech animation. ACM Trans. Graph. 37(4), 1\u201310 (2018)","journal-title":"ACM Trans. Graph."},{"key":"27_CR83","doi-asserted-by":"crossref","unstructured":"Zollh\u00f6fer, M., et al.: State of the art on monocular 3d face reconstruction, tracking, and applications. In: Computer Graphics Forum, pp. 523\u2013550 (2018)","DOI":"10.1111\/cgf.13382"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72913-3_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:27:22Z","timestamp":1733095642000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72913-3_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031729126","9783031729133"],"references-count":83,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72913-3_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}