{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:38:54Z","timestamp":1770917934061,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"Schweizerischer Nationalfonds zur F\u00f6rderung der Wissenschaftlichen Forschung","doi-asserted-by":"publisher","award":["216294"],"award-info":[{"award-number":["216294"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,21]]},"DOI":"10.1145\/3677388.3696336","type":"proceedings-article","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T08:12:19Z","timestamp":1730275939000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["EmoSpaceTime: Decoupling Emotion and Content through Contrastive Learning for Expressive 3D Speech Animation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-9782-8118","authenticated-orcid":false,"given":"Philine","family":"Witzig","sequence":"first","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7494-8660","authenticated-orcid":false,"given":"Barbara","family":"Solenthaler","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9324-779X","authenticated-orcid":false,"given":"Markus","family":"Gross","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0158-1305","authenticated-orcid":false,"given":"Rafael","family":"Wampfler","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH Zurich, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2024,11,21]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415877"},{"key":"e_1_3_2_2_2_1","volume-title":"Computer Graphics Forum, Vol.\u00a039","author":"Alexanderson Simon","unstructured":"Simon Alexanderson, Gustav\u00a0Eje Henter, Taras Kucherenko, and Jonas Beskow. 2020. Style-Controllable Speech-Driven Gesture Synthesis Using Normalising Flows. In Computer Graphics Forum, Vol.\u00a039. John Wiley & Sons, Hoboken, NJ, USA, 487\u2013496."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592458"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3230541"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-020-0133-7"},{"key":"e_1_3_2_2_6_1","volume-title":"Computer Graphics Forum, Vol.\u00a041","author":"Chandran Prashanth","unstructured":"Prashanth Chandran, Gaspard Zoss, Markus Gross, Paulo Gotardo, and Derek Bradley. 2022. Facial animation with disentangled identity and motion using transformers. In Computer Graphics Forum, Vol.\u00a041. John Wiley & Sons, Hoboken, NJ, USA, 267\u2013277."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1002\/cav.2076"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747495"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550469.3555399"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00702"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01034"},{"key":"e_1_3_2_2_12_1","volume-title":"The Geneva affective picture database (GAPED): a new 730-picture database focusing on valence and normative significance. Behavior research methods 43","author":"Dan-Glauser S.","year":"2011","unstructured":"Elise\u00a0S. Dan-Glauser and Klaus\u00a0R. Scherer. 2011. The Geneva affective picture database (GAPED): a new 730-picture database focusing on valence and normative significance. Behavior research methods 43 (2011), 468\u2013477."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_2_2_14_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arxiv:1810.04805\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"e_1_3_2_2_15_1","volume-title":"Basic emotions. Handbook of cognition and emotion 98, 45-60","author":"Paul Ekman","year":"1999","unstructured":"Paul Ekman 1999. Basic emotions. Handbook of cognition and emotion 98, 45-60 (1999), 16."},{"key":"e_1_3_2_2_16_1","volume-title":"Facial action coding system","author":"Ekman Paul","unstructured":"Paul Ekman and Wallace\u00a0V Friesen. 1978. Facial action coding system. Consulting Psychologists Press, Sunnyvale, CA, USA."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01821"},{"key":"e_1_3_2_2_18_1","unstructured":"Epic Games. 2023. Live Link Face. Mobile Application. https:\/\/dev.epicgames.com\/community\/learning\/tutorials\/lEYe\/unreal-engine-facial-capture-with-live-link"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392480"},{"key":"e_1_3_2_2_20_1","unstructured":"Dan Hendrycks and Kevin Gimpel. 2023. Gaussian Error Linear Units (GELUs). arxiv:1606.08415\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1606.08415"},{"key":"e_1_3_2_2_21_1","volume-title":"International conference on learning representations, Vol.\u00a03. ICLR, OpenReview.net.","author":"Higgins Irina","year":"2017","unstructured":"Irina Higgins, Loic Matthey, Arka Pal, Christopher Burgess, Xavier Glorot, Matthew Botvinick, Shakir Mohamed, and Alexander Lerchner. 2017. beta-vae: Learning basic visual concepts with a constrained variational framework. In International conference on learning representations, Vol.\u00a03. ICLR, OpenReview.net."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392440"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925975"},{"key":"e_1_3_2_2_24_1","volume-title":"SIGGRAPH Asia 2015 technical briefs","author":"Holden Daniel","unstructured":"Daniel Holden, Jun Saito, Taku Komura, and Thomas Joyce. 2015. Learning motion manifolds with convolutional autoencoders. In SIGGRAPH Asia 2015 technical briefs. ACM, New York, NY, USA, 1\u20134."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3259183"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01386"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073658"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356500"},{"key":"e_1_3_2_2_29_1","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2022. Auto-Encoding Variational Bayes. arxiv:1312.6114\u00a0[stat.ML] https:\/\/arxiv.org\/abs\/1312.6114"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.copsyc.2017.06.004"},{"key":"e_1_3_2_2_31_1","volume-title":"International Conference on Machine Learning. PMLR, PMLR, OpenAccess, 1558\u20131566","author":"Boesen\u00a0Lindbo Larsen Anders","year":"2016","unstructured":"Anders Boesen\u00a0Lindbo Larsen, S\u00f8ren\u00a0Kaae S\u00f8nderby, Hugo Larochelle, and Ole Winther. 2016. Autoencoding beyond pixels using a learned similarity metric. In International Conference on Machine Learning. PMLR, PMLR, OpenAccess, 1558\u20131566."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130813"},{"key":"e_1_3_2_2_33_1","volume-title":"Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Greg Kondrak and Taro Watanabe (Eds.). Asian Federation of Natural Language Processing","author":"Li Yanran","year":"2017","unstructured":"Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. 2017b. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. In Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Greg Kondrak and Taro Watanabe (Eds.). Asian Federation of Natural Language Processing, Taipei, Taiwan, 986\u2013995. https:\/\/aclanthology.org\/I17-1099"},{"key":"e_1_3_2_2_34_1","volume-title":"Proceedings, Part VII. Springer","author":"Liu Haiyang","year":"2022","unstructured":"Haiyang Liu, Zihao Zhu, Naoya Iwamoto, Yichen Peng, Zhengqing Li, You Zhou, Elif Bozkurt, and Bo Zheng. 2022. BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. In Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part VII. Springer, Cham, Switzerland, 612\u2013630."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.304"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.497"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01891"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.287"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"crossref","unstructured":"Robert Plutchik. 1982. A psychoevolutionary theory of emotions.","DOI":"10.1177\/053901882021004003"},{"key":"e_1_3_2_2_41_1","volume-title":"Train Short","author":"Press Ofir","unstructured":"Ofir Press, Noah\u00a0A. Smith, and Mike Lewis. 2022. Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation. arxiv:2108.12409\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2108.12409"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00009"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00121"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAU.1969.1162058"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623264.3624447"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00628"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073699"},{"key":"e_1_3_2_2_49_1","volume-title":"Anatomy of Facial Expression. Anatomy Next","author":"Uldis Zarins","unstructured":"Zarins Uldis. 2019. Anatomy of Facial Expression. Anatomy Next, Inc., Beacon, NY 12508, USA."},{"key":"e_1_3_2_2_50_1","unstructured":"Aaron van\u00a0den Oord Yazhe Li and Oriol Vinyals. 2019. Representation Learning with Contrastive Predictive Coding. arxiv:1807.03748\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1807.03748"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295378"},{"key":"e_1_3_2_2_52_1","volume-title":"Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates, Inc., Red Hook, NY, USA.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_2_53_1","unstructured":"Qianyun Wang Zhenfeng Fan and Shihong Xia. 2021a. 3D-TalkEmo: Learning to Synthesize 3D Emotional Talking Head. arxiv:2104.12051\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2104.12051"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Suzhen Wang Lincheng Li Yu Ding Changjie Fan and Xin Yu. 2021b. Audio2Head: Audio-driven One-shot Talking-head Generation with Natural Head Motion. arxiv:2107.09293\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2107.09293","DOI":"10.24963\/ijcai.2021\/152"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640794.3665541"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01052"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01229"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00212"},{"key":"e_1_3_2_2_59_1","unstructured":"Yifei Zeng Yuanxun Lu Xinya Ji Yao Yao Hao Zhu and Xun Cao. 2023. AvatarBooth: High-Quality and Customizable 3D Human Avatar Generation. arxiv:2306.09864\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2306.09864"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3618309"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3414685.3417774","article-title":"Makelttalk: speaker-aware talking-head animation","volume":"39","author":"Zhou Yang","year":"2020","unstructured":"Yang Zhou, Xintong Han, Eli Shechtman, Jose Echevarria, Evangelos Kalogerakis, and Dingzeyu Li. 2020. Makelttalk: speaker-aware talking-head animation. ACM Transactions On Graphics (TOG) 39, 6 (2020), 1\u201315.","journal-title":"ACM Transactions On Graphics (TOG)"}],"event":{"name":"MIG '24: The 17th ACM SIGGRAPH Conference on Motion, Interaction, and Games","location":"Arlington VA USA","acronym":"MIG '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["The 17th ACM SIGGRAPH Conference on Motion Interaction and Games"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3677388.3696336","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3677388.3696336","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T13:08:06Z","timestamp":1756472886000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3677388.3696336"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"references-count":61,"alternative-id":["10.1145\/3677388.3696336","10.1145\/3677388"],"URL":"https:\/\/doi.org\/10.1145\/3677388.3696336","relation":{},"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"2024-11-21","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}