{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T18:52:37Z","timestamp":1776106357044,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":45,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62271083"],"award-info":[{"award-number":["No. 62271083"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Key Project of the National Language Commission","award":["No. ZDI145-81"],"award-info":[{"award-number":["No. ZDI145-81"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["No. 2023RC73"],"award-info":[{"award-number":["No. 2023RC73"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762030","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:55:00Z","timestamp":1761375300000},"page":"14014-14020","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["EMO-Avatar: An LLM-Agent-Orchestrated Framework for Multimodal Emotional Support in Human Animation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-4251-699X","authenticated-orcid":false,"given":"Keqi","family":"Chen","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1576-9183","authenticated-orcid":false,"given":"Wenxin","family":"Fu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7258-7820","authenticated-orcid":false,"given":"Qihang","family":"Lu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1231-6581","authenticated-orcid":false,"given":"Zekai","family":"Sun","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5095-9318","authenticated-orcid":false,"given":"Yizhong","family":"Geng","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2001-5806","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6123-7937","authenticated-orcid":false,"given":"Puyuan","family":"Guo","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5881-3723","authenticated-orcid":false,"given":"Yingming","family":"Gao","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6284-5039","authenticated-orcid":false,"given":"Ya","family":"Li","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Canopy AI. 2025. Orpheus-TTS. GitHub repository. https:\/\/github.com\/canopyai\/Orpheus-TTS Accessed: 2025-07-20."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1080\/10503307.2019.1680901"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.3109\/09638237.2015.1079308"},{"key":"e_1_3_2_1_4_1","volume-title":"Handbook of communication and social interaction skills","author":"Burleson Brant R","unstructured":"Brant R Burleson. 2003. Emotional support skills. In Handbook of communication and social interaction skills. Routledge, 569-612."},{"key":"e_1_3_2_1_5_1","volume-title":"IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation","author":"Busso Carlos","year":"2008","unstructured":"Carlos Busso, Murtaza Bulut, Chi-Chun Lee, Abe Kazemzadeh, Emily Mower, Samuel Kim, Jeannette N Chang, Sungbok Lee, and Shrikanth S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation, Vol. 42, 4 (2008), 335-359."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i3.32241"},{"key":"e_1_3_2_1_7_1","volume-title":"Towards multimodal emotional support conversation systems. arXiv preprint arXiv:2408.03650","author":"Chu Yuqi","year":"2024","unstructured":"Yuqi Chu, Lizi Liao, Zhiyuan Zhou, Chong-Wah Ngo, and Richang Hong. 2024. Towards multimodal emotional support conversation systems. arXiv preprint arXiv:2408.03650 (2024)."},{"key":"e_1_3_2_1_8_1","volume-title":"Hallo2: Long-duration and high-resolution audio-driven portrait image animation. arXiv preprint arXiv:2410.07718","author":"Cui Jiahao","year":"2024","unstructured":"Jiahao Cui, Hui Li, Yao Yao, Hao Zhu, Hanlin Shang, Kaihui Cheng, Hang Zhou, Siyu Zhu, and Jingdong Wang. 2024. Hallo2: Long-duration and high-resolution audio-driven portrait image animation. arXiv preprint arXiv:2410.07718 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"Ding Ding Zeqian Ju Yichong Leng Songxiang Liu Tong Liu Zeyu Shang Kai Shen Wei Song Xu Tan Heyi Tang et al. 2025. Kimi-audio technical report. arXiv preprint arXiv:2504.18425 (2025)."},{"key":"e_1_3_2_1_10_1","unstructured":"Zhihao Du Changfeng Gao Yuxuan Wang Fan Yu Tianyu Zhao Hao Wang Xiang Lv Hui Wang Chongjia Ni Xian Shi et al. 2025. Cosyvoice 3: Towards in-the-wild speech generation via scaling-up and post-training. arXiv preprint arXiv:2505.17589 (2025)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3689171"},{"key":"e_1_3_2_1_12_1","volume-title":"Therapeutic reasoning: planning, implementing, and evaluating the outcomes of therapy. Model of Human Occupation: Theory and Application","author":"Forsyth Kirsty","year":"2008","unstructured":"Kirsty Forsyth. 2008. Therapeutic reasoning: planning, implementing, and evaluating the outcomes of therapy. Model of Human Occupation: Theory and Application (2008), 143."},{"key":"e_1_3_2_1_13_1","volume-title":"Helping skills","author":"Hill Clara E","year":"1999","unstructured":"Clara E Hill and Karen M O'Brien. 1999. Helping skills. Washington, DC: American Psychological Association (1999)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1037\/0022-0167.55.3.359"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681245"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18401\/2017.7.1.3"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681406"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.chb.2011.03.008"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612053"},{"key":"e_1_3_2_1_20_1","unstructured":"Chunyu Li Chao Zhang Weikai Xu Jingyu Lin Jinghui Xie Weiguo Feng Bingyue Peng Cunjian Chen and Weiwei Xing. 2025. LatentSync: Taming Audio-Conditioned Latent Diffusion Models for Lip Sync with SyncNet Supervision. arXiv:2412.09262 [cs.CV] https:\/\/arxiv.org\/abs\/2412.09262"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681425"},{"key":"e_1_3_2_1_22_1","volume-title":"Towards emotional support dialog systems. arXiv preprint arXiv:2106.01144","author":"Liu Siyang","year":"2021","unstructured":"Siyang Liu, Chujie Zheng, Orianna Demasi, Sahand Sabour, Yu Li, Zhou Yu, Yong Jiang, and Minlie Huang. 2021. Towards emotional support dialog systems. arXiv preprint arXiv:2106.01144 (2021)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680674"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680705"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Meng Luo Han Zhang Shengqiong Wu Bobo Li Hong Han and Hao Fei. 2024b. NUS-Emo at SemEval-2024 Task 3: Instruction-Tuning LLM for Multimodal Emotion-Cause Analysis in Conversations. arXiv:2501.17261 [cs.CL] https:\/\/arxiv.org\/abs\/2501.17261","DOI":"10.18653\/v1\/2024.semeval-1.226"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681228"},{"key":"e_1_3_2_1_27_1","unstructured":"Nari Labs. 2025. dia: Dialogue Interaction Assistant. https:\/\/github.com\/nari-labs\/dia."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681146"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681359"},{"key":"e_1_3_2_1_30_1","volume-title":"The interaction of emotion and reasoning in contemporary talking therapy","author":"Silverman Katy","unstructured":"Katy Silverman. 2013. The interaction of emotion and reasoning in contemporary talking therapy. The University of Manchester (United Kingdom)."},{"key":"e_1_3_2_1_31_1","volume-title":"MISC: A mixed strategy-aware model integrating COMET for emotional support conversation. arXiv preprint arXiv:2203.13560","author":"Tu Quan","year":"2022","unstructured":"Quan Tu, Yanran Li, Jianwei Cui, Bin Wang, Ji-Rong Wen, and Rui Yan. 2022. MISC: A mixed strategy-aware model integrating COMET for emotional support conversation. arXiv preprint arXiv:2203.13560 (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"Spark-tts: An efficient llm-based text-to-speech model with single-stream decoupled speech tokens. arXiv preprint arXiv:2503.01710","author":"Wang Xinsheng","year":"2025","unstructured":"Xinsheng Wang, Mingqi Jiang, Ziyang Ma, Ziyu Zhang, Songxiang Liu, Linqin Li, Zheng Liang, Qixi Zheng, Rui Wang, Xiaoqin Feng, et al., 2025. Spark-tts: An efficient llm-based text-to-speech model with single-stream decoupled speech tokens. arXiv preprint arXiv:2503.01710 (2025)."},{"key":"e_1_3_2_1_33_1","volume-title":"Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694","author":"Wei Huawei","year":"2024","unstructured":"Huawei Wei, Zejun Yang, and Zhisheng Wang. 2024. Aniportrait: Audio-driven synthesis of photorealistic portrait animation. arXiv preprint arXiv:2403.17694 (2024)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681366"},{"key":"e_1_3_2_1_35_1","unstructured":"Jin Xu Zhifang Guo Jinzheng He Hangrui Hu Ting He Shuai Bai Keqin Chen Jialin Wang Yang Fan Kai Dang et al. 2025. Qwen2. 5-omni technical report. arXiv preprint arXiv:2503.20215 (2025)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681039"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680619"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681603"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Zhenhui Ye Tianyun Zhong Yi Ren Ziyue Jiang Jiawei Huang Rongjie Huang Jinglin Liu Jinzheng He Chen Zhang Zehan Wang et al. 2024b. MimicTalk: Mimicking a personalized and expressive 3D talking face in minutes. Advances in neural information processing systems Vol. 37 (2024) 1829-1853.","DOI":"10.52202\/079017-0058"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681328"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681394"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3696410.3714739"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680821"},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia. 6860-6869","author":"Zhang Weitian","year":"2024","unstructured":"Weitian Zhang, Yichao Yan, Yunhui Liu, Xingdong Sheng, and Xiaokang Yang. 2024. E 3Gen: Efficient, Expressive and Editable Avatars Generation. In Proceedings of the 32nd ACM International Conference on Multimedia. 6860-6869."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.3389\/fpsyg.2020.611248"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762030","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T03:57:45Z","timestamp":1765339065000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762030"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":45,"alternative-id":["10.1145\/3746027.3762030","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762030","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}