{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T02:43:16Z","timestamp":1768272196206,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":39,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556755","type":"print"},{"value":"9789819556762","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5676-2_35","type":"book-chapter","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T20:32:10Z","timestamp":1768249930000},"page":"519-535","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["AsynFusion: Towards Asynchronous Latent Consistency Models for\u00a0Decoupled Whole-Body Audio-Driven Avatars"],"prefix":"10.1007","author":[{"given":"Tianbao","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Jian","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Yuer","family":"Li","sequence":"additional","affiliation":[]},{"given":"Zheng","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Ping","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Zhaoxin","family":"Fan","sequence":"additional","affiliation":[]},{"given":"Wenjun","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Xuelong","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"issue":"4","key":"35_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592458","volume":"42","author":"S Alexanderson","year":"2023","unstructured":"Alexanderson, S., Nagy, R., Beskow, J., Henter, G.E.: Listen, denoise, action! audio-driven motion synthesis with diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201320 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"6","key":"35_CR2","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3550454.3555435","volume":"41","author":"T Ao","year":"2022","unstructured":"Ao, T., Gao, Q., Lou, Y., Chen, B., Liu, L.: Rhythmic gesticulator: rhythm-aware co-speech gesture synthesis with hierarchical neural embeddings. ACM Trans. Graph. (TOG) 41(6), 1\u201319 (2022)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"35_CR3","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"key":"35_CR4","doi-asserted-by":"crossref","unstructured":"Bhattacharya, U., Childs, E., Rewkowski, N., Manocha, D.: Speech2AffectiveGestures: synthesizing co-speech gestures with generative adversarial affective expression learning. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2027\u20132036 (2021)","DOI":"10.1145\/3474085.3475223"},{"key":"35_CR5","doi-asserted-by":"crossref","unstructured":"Bhattacharya, U., Rewkowski, N., Banerjee, A., Guhan, P., Bera, A., Manocha, D.: Text2Gestures: a transformer-based network for generating emotive body gestures for virtual agents. In: 2021 IEEE Virtual Reality and 3D User Interfaces (VR), pp. 1\u201310. IEEE (2021)","DOI":"10.1109\/VR50410.2021.00037"},{"key":"35_CR6","doi-asserted-by":"crossref","unstructured":"Cassell, J., Vilhj\u00e1lmsson, H.H., Bickmore, T.: Beat: the behavior expression animation toolkit. In: Proceedings of the 28th Annual Conference on Computer Graphics and Interactive Techniques, pp. 477\u2013486 (2001)","DOI":"10.1145\/383259.383315"},{"key":"35_CR7","doi-asserted-by":"crossref","unstructured":"Chen, J., Liu, Y., Wang, J., Zeng, A., Li, Y., Chen, Q.: DiffSHEG: a diffusion-based approach for real-time speech-driven holistic 3D expression and gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7352\u20137361 (2024)","DOI":"10.1109\/CVPR52733.2024.00702"},{"key":"35_CR8","doi-asserted-by":"crossref","unstructured":"Chhatre, K., Athanasiou, N., Becherini, G., Peters, C., Black, M.J., Bolkart, T., et\u00a0al.: Emotional speech-driven 3D body animation via disentangled latent diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1942\u20131953 (2024)","DOI":"10.1109\/CVPR52733.2024.00190"},{"key":"35_CR9","doi-asserted-by":"crossref","unstructured":"Dan\u011b\u010dek, R., Chhatre, K., Tripathi, S., Wen, Y., Black, M., Bolkart, T.: Emotional speech-driven animation with content-emotion disentanglement. In: SIGGRAPH Asia 2023 Conference Papers, pp. 1\u201313 (2023)","DOI":"10.1145\/3610548.3618183"},{"key":"35_CR10","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lin, Z., Saito, J., Wang, W., Komura, T.: FaceFormer: speech-driven 3D facial animation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18770\u201318780 (2022)","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"35_CR11","doi-asserted-by":"crossref","unstructured":"Ginosar, S., Bar, A., Kohavi, G., Chan, C., Owens, A., Malik, J.: Learning individual styles of conversational gesture. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3497\u20133506 (2019)","DOI":"10.1109\/CVPR.2019.00361"},{"key":"35_CR12","doi-asserted-by":"crossref","unstructured":"Habibie, I., et al.: Learning speech-driven 3D conversational gestures from video. In: Proceedings of the 21st ACM International Conference on Intelligent Virtual Agents, pp. 101\u2013108 (2021)","DOI":"10.1145\/3472306.3478335"},{"key":"35_CR13","doi-asserted-by":"crossref","unstructured":"Hasegawa, D., Kaneko, N., Shirakawa, S., Sakuta, H., Sumi, K.: Evaluation of speech-to-gesture generation using bi-directional LSTM network. In: Proceedings of the 18th International Conference on Intelligent Virtual Agents, pp. 79\u201386 (2018)","DOI":"10.1145\/3267851.3267878"},{"key":"35_CR14","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"35_CR15","doi-asserted-by":"crossref","unstructured":"Huang, X., Belongie, S.: Arbitrary style transfer in real-time with adaptive instance normalization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1501\u20131510 (2017)","DOI":"10.1109\/ICCV.2017.167"},{"issue":"4","key":"35_CR16","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3072959.3073658","volume":"36","author":"T Karras","year":"2017","unstructured":"Karras, T., Aila, T., Laine, S., Herva, A., Lehtinen, J.: Audio-driven facial animation by joint end-to-end learning of pose and emotion. ACM Trans. Graph. (ToG) 36(4), 1\u201312 (2017)","journal-title":"ACM Trans. Graph. (ToG)"},{"key":"35_CR17","unstructured":"Kipp, M.: Gesture generation by imitation: From human behavior to computer character animation. Universal-Publishers (2005)"},{"key":"35_CR18","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"205","DOI":"10.1007\/11821830_17","volume-title":"Intelligent Virtual Agents","author":"S Kopp","year":"2006","unstructured":"Kopp, S., et al.: Towards a common framework for multimodal generation: the behavior markup language. In: Gratch, J., Young, M., Aylett, R., Ballin, D., Olivier, P. (eds.) IVA 2006. LNCS (LNAI), vol. 4133, pp. 205\u2013217. Springer, Heidelberg (2006). https:\/\/doi.org\/10.1007\/11821830_17"},{"key":"35_CR19","doi-asserted-by":"crossref","unstructured":"Kucherenko, T., et al.: Gesticulator: a framework for semantically-aware speech-driven gesture generation. In: Proceedings of the 2020 International Conference on Multimodal Interaction, pp. 242\u2013250 (2020)","DOI":"10.1145\/3382507.3418815"},{"key":"35_CR20","doi-asserted-by":"crossref","unstructured":"Liu, H., et al.: EMAGE: towards unified holistic co-speech gesture generation via expressive masked audio gesture modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1144\u20131154 (2024)","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"35_CR21","doi-asserted-by":"publisher","unstructured":"Liu, H., et al.: Beat: a large-scale semantic and emotional multi-modal dataset for conversational gestures synthesis. In: European Conference on Computer Vision, pp. 612\u2013630. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20071-7_36","DOI":"10.1007\/978-3-031-20071-7_36"},{"key":"35_CR22","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: Learning hierarchical cross-modal association for co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10462\u201310472 (2022)","DOI":"10.1109\/CVPR52688.2022.01021"},{"key":"35_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Y., Cao, Q., Wen, Y., Jiang, H., Ding, C.: Towards variable and coordinated holistic co-speech motion generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1566\u20131576 (2024)","DOI":"10.1109\/CVPR52733.2024.00155"},{"issue":"6","key":"35_CR24","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2816795.2818130","volume":"34","author":"Y Liu","year":"2015","unstructured":"Liu, Y., Xu, F., Chai, J., Tong, X., Wang, L., Huo, Q.: Video-audio driven real-time facial animation. ACM Trans. Graph. (TOG) 34(6), 1\u201310 (2015)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"35_CR25","doi-asserted-by":"crossref","unstructured":"Liu, Y., Lin, L., Yu, F., Zhou, C., Li, Y.: MODA: mapping-once audio-driven portrait animation with dual attentions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 23020\u201323029 (2023)","DOI":"10.1109\/ICCV51070.2023.02104"},{"key":"35_CR26","unstructured":"Luo, S., Tan, Y., Huang, L., Li, J., Zhao, H.: Latent consistency models: synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378 (2023)"},{"issue":"1","key":"35_CR27","doi-asserted-by":"publisher","first-page":"486","DOI":"10.3390\/encyclopedia2010031","volume":"2","author":"S Mystakidis","year":"2022","unstructured":"Mystakidis, S.: Metaverse. Encyclopedia 2(1), 486\u2013497 (2022)","journal-title":"Encyclopedia"},{"key":"35_CR28","doi-asserted-by":"crossref","unstructured":"Pavlakos, G., et al.: Expressive body capture: 3D hands, face, and body from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10975\u201310985 (2019)","DOI":"10.1109\/CVPR.2019.01123"},{"key":"35_CR29","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"35_CR30","doi-asserted-by":"crossref","unstructured":"Peng, Z., et al.: EmoTalk: speech-driven emotional disentanglement for 3D face animation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20687\u201320697 (2023)","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"35_CR31","doi-asserted-by":"crossref","unstructured":"Qi, X., Liu, C., Li, L., Hou, J., Xin, H., Yu, X.: EmotionGesture: audio-driven diverse emotional co-speech 3d gesture generation. IEEE Trans. Multimedia (2024)","DOI":"10.1109\/TMM.2024.3407692"},{"key":"35_CR32","doi-asserted-by":"publisher","unstructured":"Van\u00a0Mulken, S., Andre, E., M\u00fcller, J.: The persona effect: how substantial is it? In: People and computers XIII: Proceedings of HCI 1998, pp. 53\u201366. Springer (1998). https:\/\/doi.org\/10.1007\/978-1-4471-3605-7_4","DOI":"10.1007\/978-1-4471-3605-7_4"},{"issue":"1","key":"35_CR33","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1109\/COMST.2022.3202047","volume":"25","author":"Y Wang","year":"2022","unstructured":"Wang, Y., et al.: A survey on metaverse: fundamentals, security, and privacy. IEEE Commun. Surv. Tutorials 25(1), 319\u2013352 (2022)","journal-title":"IEEE Commun. Surv. Tutorials"},{"key":"35_CR34","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Cun, X., Wang, J., Wong, T.T.: CodeTalker: speech-driven 3D facial animation with discrete motion prior. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12780\u201312790 (2023)","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"35_CR35","unstructured":"Xu, C., et al.: Combo: co-speech holistic 3D human motion generation and efficient customizable adaptation in harmony. arXiv preprint arXiv:2408.09397 (2024)"},{"key":"35_CR36","doi-asserted-by":"crossref","unstructured":"Yang, S., et al.: DiffuseStyleGesture: stylized audio-driven co-speech gesture generation with diffusion models. arXiv preprint arXiv:2305.04919 (2023)","DOI":"10.24963\/ijcai.2023\/650"},{"key":"35_CR37","doi-asserted-by":"crossref","unstructured":"Yi, H., et al.: Generating holistic 3D human motion from speech. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 469\u2013480 (2023)","DOI":"10.1109\/CVPR52729.2023.00053"},{"issue":"6","key":"35_CR38","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417838","volume":"39","author":"Y Yoon","year":"2020","unstructured":"Yoon, Y., et al.: Speech gesture generation from the trimodal context of text, audio, and speaker identity. ACM Trans. Graph. (TOG) 39(6), 1\u201316 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"35_CR39","doi-asserted-by":"crossref","unstructured":"Zhu, L., Liu, X., Liu, X., Qian, R., Liu, Z., Yu, L.: Taming diffusion models for audio-driven co-speech gesture generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10544\u201310553 (2023)","DOI":"10.1109\/CVPR52729.2023.01016"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5676-2_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T20:32:14Z","timestamp":1768249934000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5676-2_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556755","9789819556762"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5676-2_35","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"13 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}