{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:25:40Z","timestamp":1765308340218,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62402136"],"award-info":[{"award-number":["62402136"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of Shandong Province","award":["ZR2024QF064"],"award-info":[{"award-number":["ZR2024QF064"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755093","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"9733-9742","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["REA-Listener: Real-Time Listening Head Generation with Dynamic Emotion Modeling and Flexible Modality Adaptation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2797-0389","authenticated-orcid":false,"given":"Sizhe","family":"Zhao","sequence":"first","affiliation":[{"name":"Harbin Institute of Technology, Weihai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1865-7572","authenticated-orcid":false,"given":"Chenyang","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Weihai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2661-3692","authenticated-orcid":false,"given":"Weiyu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Weihai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4181-310X","authenticated-orcid":false,"given":"Zonglin","family":"Li","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Weihai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1032-7281","authenticated-orcid":false,"given":"Ming","family":"Li","sequence":"additional","affiliation":[{"name":"Shandong Inspur Database Technology Co., Ltd, Jinan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5200-3420","authenticated-orcid":false,"given":"Shengping","family":"Zhang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Weihai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596730"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1891903.1891910"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/192161.192272"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00802"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612869"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1838"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01967"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00812"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3450626.3459936"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02069"},{"key":"e_1_3_2_1_12_1","volume-title":"Affective faces for goal-driven dyadic communication. arXiv preprint arXiv:2301.10939","author":"Geng Scott","year":"2023","unstructured":"Scott Geng, Revant Teotia, Purva Tendulkar, Sachit Menon, and Carl Vondrick. 2023. Affective faces for goal-driven dyadic communication. arXiv preprint arXiv:2301.10939 (2023)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00573"},{"key":"e_1_3_2_1_14_1","volume-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_15_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_16_1","volume-title":"Long short-term memory. Neural computation 9, 8","author":"Hochreiter Sepp","year":"1997","unstructured":"Sepp Hochreiter and J\u00fcrgen Schmidhuber. 1997. Long short-term memory. Neural computation 9, 8 (1997), 1735-1780."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551577"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530745"},{"key":"e_1_3_2_1_19_1","volume-title":"The Thirteenth International Conference on Learning Representations.","author":"Jiang Jianwen","year":"2024","unstructured":"Jianwen Jiang, Chao Liang, Jiaqi Yang, Gaojie Lin, Tianyun Zhong, and Yanbo Zheng. 2024. Loopy: Taming audio-driven portrait avatar with long-term motion dependency. In The Thirteenth International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383652.3423911"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130813"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00338"},{"key":"e_1_3_2_1_24_1","unstructured":"Aixin Liu Bei Feng Bing Xue BingxuanWang BochaoWu Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan et al. 2024. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612123"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681182"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00234"},{"key":"e_1_3_2_1_28_1","first-page":"21386","article-title":"Audio-driven co-speech gesture video generation","volume":"35","author":"Liu Xian","year":"2022","unstructured":"Xian Liu, Qianyi Wu, Hang Zhou, Yuanqi Du, Wayne Wu, Dahua Lin, and Ziwei Liu. 2022. Audio-driven co-speech gesture video generation. Advances in Neural Information Processing Systems 35 (2022), 21386-21399.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_29_1","volume-title":"Juhyun Lee, et al.","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming Guang Yong, Juhyun Lee, et al. 2019. Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)."},{"key":"e_1_3_2_1_30_1","volume-title":"Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767 2, 3","author":"Ma Yifeng","year":"2023","unstructured":"Yifeng Ma, Shiwei Zhang, Jiayu Wang, Xiang Wang, Yingya Zhang, and Zhidong Deng. 2023. Dreamtalk: When expressive talking head generation meets diffusion probabilistic models. arXiv preprint arXiv:2312.09767 2, 3 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2131660"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01975"},{"key":"e_1_3_2_1_33_1","volume-title":"Hyeongseop Rha, Minsu Kim, Joanna Hong, Jeong Hun Yeo, and Yong Man Ro.","author":"Park Se Jin","year":"2024","unstructured":"Se Jin Park, Chae Won Kim, Hyeongseop Rha, Minsu Kim, Joanna Hong, Jeong Hun Yeo, and Yong Man Ro. 2024. Let's Go Real Talk: Spoken Dialogue Model for Face-to-Face Conversation. arXiv preprint arXiv:2406.07867 (2024)."},{"key":"e_1_3_2_1_34_1","first-page":"296","article-title":"A 3D face model for pose and illumination invariant face recognition. In 2009 sixth IEEE international conference on advanced video and signal based surveillance","author":"Paysan Pascal","year":"2009","unstructured":"Pascal Paysan, Reinhard Knothe, Brian Amberg, Sami Romdhani, and Thomas Vetter. 2009. A 3D face model for pose and illumination invariant face recognition. In 2009 sixth IEEE international conference on advanced video and signal based surveillance. Ieee, 296-301.","journal-title":"Ieee"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_2_1_36_1","volume-title":"The Twelfth International Conference on Learning Representations, ICLR 2024","author":"Puigcerver Joan","year":"2024","unstructured":"Joan Puigcerver, Carlos Riquelme Ruiz, Basil Mustafa, and Neil Houlsby. 2024. From Sparse to Soft Mixtures of Experts. In The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024."},{"key":"e_1_3_2_1_37_1","volume-title":"International conference on machine learning. PMLR, 28492-28518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, JongWook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International conference on machine learning. PMLR, 28492-28518."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01350"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 20839-20849","author":"Song Luchuan","year":"2023","unstructured":"Luchuan Song, Guojun Yin, Zhenchao Jin, Xiaoyi Dong, and Chenliang Xu. 2023. Emotional listener portrait: Neural listener head generation with emotion. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 20839-20849."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00502"},{"key":"e_1_3_2_1_41_1","volume-title":"Resolution-robust Large Mask Inpainting with Fourier Convolutions. arXiv preprint arXiv:2109.07161","author":"Suvorov Roman","year":"2021","unstructured":"Roman Suvorov, Elizaveta Logacheva, Anton Mashikhin, Anastasia Remizova, Arsenii Ashukha, Aleksei Silvestrov, Naejin Kong, Harshith Goka, Kiwoong Park, and Victor Lempitsky. 2021. Resolution-robust Large Mask Inpainting with Fourier Convolutions. arXiv preprint arXiv:2109.07161 (2021)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02486"},{"key":"e_1_3_2_1_43_1","volume-title":"DIM: Dyadic Interaction Modeling for Social Behavior Generation. In European Conference on Computer Vision. Springer, 484-503","author":"Tran Minh","year":"2024","unstructured":"Minh Tran, Di Chang, Maksim Siniukov, and Mohammad Soleymani. 2024. DIM: Dyadic Interaction Modeling for Social Behavior Generation. In European Conference on Computer Vision. Springer, 484-503."},{"key":"e_1_3_2_1_44_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_45_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_46_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13, 4 (2004), 600-612."},{"key":"e_1_3_2_1_47_1","first-page":"660","article-title":"Vasa-1: Lifelike audio-driven talking faces generated in real time","volume":"37","author":"Xu Sicheng","year":"2024","unstructured":"Sicheng Xu, Guojun Chen, Yu-Xiao Guo, Jiaolong Yang, Chong Li, Zhenyu Zang, Yizhong Zhang, Xin Tong, and Baining Guo. 2024. Vasa-1: Lifelike audio-driven talking faces generated in real time. Advances in Neural Information Processing Systems 37 (2024), 660-684.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00836"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00366"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_8"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG) 39, 6 (2020), 1-15.","journal-title":"ACM Transactions On Graphics (TOG)"},{"key":"e_1_3_2_1_52_1","volume-title":"INFP: Audio-driven interactive head generation in dyadic conversations. arXiv preprint arXiv:2412.04037","author":"Zhu Yongming","year":"2024","unstructured":"Yongming Zhu, Longhao Zhang, Zhengkun Rong, Tianshu Hu, Shuang Liang, and Zhipeng Ge. 2024. INFP: Audio-driven interactive head generation in dyadic conversations. arXiv preprint arXiv:2412.04037 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755093","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:20:55Z","timestamp":1765308055000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755093"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":52,"alternative-id":["10.1145\/3746027.3755093","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755093","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}