{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:59:16Z","timestamp":1776931156182,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":90,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100004063","name":"Knut och Alice Wallenbergs Stiftelse","doi-asserted-by":"publisher","award":[""],"award-info":[{"award-number":[""]}],"id":[{"id":"10.13039\/501100004063","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,13]]},"DOI":"10.1145\/3716553.3750747","type":"proceedings-article","created":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T13:13:16Z","timestamp":1760188396000},"page":"238-248","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Speech-to-Joy: Self-Supervised Features for Enjoyment Prediction in Human\u2013Robot Conversation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-1152-6457","authenticated-orcid":false,"given":"Ricardo","family":"Santana","sequence":"first","affiliation":[{"name":"KTH Royal Institute of Technology, Stockholm, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7983-079X","authenticated-orcid":false,"given":"Bahar","family":"Irfan","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology, Stockholm, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8937-8063","authenticated-orcid":false,"given":"Erik","family":"Lagerstedt","sequence":"additional","affiliation":[{"name":"Department of Philosophy, Linguistics and Theory of Science, University of Gothenburg, Gothenburg, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8579-1790","authenticated-orcid":false,"given":"Gabriel","family":"Skantze","sequence":"additional","affiliation":[{"name":"KTH, Stockholm, Sweden"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2428-0468","authenticated-orcid":false,"given":"Andre","family":"Pereira","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology, Stockholm, Sweden"}]}],"member":"320","published-online":{"date-parts":[[2025,10,12]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Ali Abedi and Shehroz\u00a0S Khan. 2024. Affect-driven ordinal engagement measurement from video. Multimedia Tools and Applications 83 8 (2024) 24899\u201324918.","DOI":"10.1007\/s11042-023-16345-2"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-34584-5_9"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/1896\/1\/012004"},{"key":"e_1_3_3_2_5_2","unstructured":"Alexei Baevski Yuhao Zhou Abdelrahman Mohamed and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances in neural information processing systems 33 (2020) 12449\u201312460."},{"key":"e_1_3_3_2_6_2","unstructured":"Dzmitry Bahdanau Kyunghyun Cho and Yoshua Bengio. 2014. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.0473 (2014)."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1017\/9781009424202"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Erdenebileg Batbaatar Meijing Li and Keun\u00a0Ho Ryu. 2019. Semantic-emotion neural network for emotion recognition from text. IEEE access 7 (2019) 111866\u2013111878.","DOI":"10.1109\/ACCESS.2019.2934529"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"crossref","unstructured":"Tony Belpaeme James Kennedy Aditi Ramachandran Brian Scassellati and Fumihide Tanaka. 2018. Social robots for education: A review. Science robotics 3 21 (2018) eaat5954.","DOI":"10.1126\/scirobotics.aat5954"},{"key":"e_1_3_3_2_10_2","first-page":"4","volume-title":"ICML","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML , Vol.\u00a02. 4."},{"key":"e_1_3_3_2_11_2","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared\u00a0D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et\u00a0al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877\u20131901."},{"key":"e_1_3_3_2_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"crossref","unstructured":"Zebang Cheng Zhi-Qi Cheng Jun-Yan He Kai Wang Yuxiang Lin Zheng Lian Xiaojiang Peng and Alexander Hauptmann. 2024. Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning. Advances in Neural Information Processing Systems 37 (2024) 110805\u2013110853.","DOI":"10.52202\/079017-3518"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Ana Cisnal Victor Moreno-SanJuan Juan\u00a0Carlos Fraile Javier\u00a0P Turiel Eusebio De-la Fuente and Guillermo S\u00e1nchez-Brizuela. 2022. Assessment of the patient\u2019s emotional response with the robhand rehabilitation platform: A case series study. Journal of Clinical Medicine 11 15 (2022) 4442.","DOI":"10.3390\/jcm11154442"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-017-9088-8"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Kerstin Dautenhahn. 2007. Socially intelligent robots: dimensions of human\u2013robot interaction. Philosophical transactions of the royal society B: Biological sciences 362 1480 (2007) 679\u2013704.","DOI":"10.1098\/rstb.2006.2004"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86970-0_39"},{"key":"e_1_3_3_2_18_2","first-page":"4171","volume-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 4171\u20134186."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"crossref","unstructured":"Paul Ekman Tim Dalgleish and M Power. 1999. Basic emotions. San Francisco USA (1999).","DOI":"10.1002\/0470013494.ch3"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Kaouther Ezzameli and Hela Mahersia. 2023. Emotion recognition from unimodal to multimodal analysis: A review. Information Fusion 99 (2023) 101847.","DOI":"10.1016\/j.inffus.2023.101847"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"crossref","unstructured":"Luke\u00a0K Fryer and Daniel\u00a0L Dinsmore. 2020. The Promise and Pitfalls of Self-report: Development research design and analysis issues and multiple methods. Frontline Learning Research (2020).","DOI":"10.14786\/flr.v8i3.623"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICBIR57571.2023.10147577"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/RO-MAN57019.2023.10309450"},{"key":"e_1_3_3_2_26_2","doi-asserted-by":"crossref","unstructured":"Marcel Heerink Ben Kr\u00f6se Vanessa Evers and Bob Wielinga. 2010. Assessing acceptance of assistive social agent technology by older adults: the almere model.","DOI":"10.1007\/s12369-010-0068-5"},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/1349822.1349838"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"Sanoar Hossain Saiyed Umer Ranjeet\u00a0Kr Rout and M Tanveer. 2023. Fine-grained image analysis for facial expression recognition using deep convolutional neural networks with bilinear pooling. Applied Soft Computing 134 (2023) 109997.","DOI":"10.1016\/j.asoc.2023.109997"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Wei-Ning Hsu Benjamin Bolte Yao-Hung\u00a0Hubert Tsai Kushal Lakhotia Ruslan Salakhutdinov and Abdelrahman Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM transactions on audio speech and language processing 29 (2021) 3451\u20133460.","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"e_1_3_3_2_31_2","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang Weizhu Chen et\u00a0al. 2022. Lora: Low-rank adaptation of large language models. ICLR 1 2 (2022) 3."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Takamasa Iio Yuichiro Yoshikawa Mariko Chiba Taichi Asami Yoshinori Isoda and Hiroshi Ishiguro. 2020. Twin-robot dialogue system with robustness against speech recognition failure in human-robot dialogue with elderly people. Applied Sciences 10 4 (2020) 1522.","DOI":"10.3390\/app10041522"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173386.3173389"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"crossref","unstructured":"Bahar Irfan Sanna Kuoppam\u00e4ki Aida Hosseini and Gabriel Skantze. 2025. Between Reality and Delusion: Challenges of Applying Large Language Models to Companion Robots for Open-Domain Dialogues with Older Adults. Autonomous Robots 49 9 (2025).","DOI":"10.1007\/s10514-025-10190-y"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"Bahar Irfan Jura Miniota Sofia Thunberg Erik Lagerstedt Sanna Kuoppam\u00e4ki Gabriel Skantze and Andr\u00e9 Pereira. 2024. Human-Robot Interaction Conversational User Enjoyment Scale (HRI CUES). arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.01354 (2024).","DOI":"10.1109\/TAFFC.2025.3590359"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/HRI61500.2025.10974215"},{"key":"e_1_3_3_2_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/HRI61500.2025.10973944"},{"key":"e_1_3_3_2_38_2","unstructured":"Albert\u00a0Q. Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra\u00a0Singh Chaplot Diego de\u00a0las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio\u00a0Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven\u00a0Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William\u00a0El Sayed. 2023. Mistral 7B. arxiv:https:\/\/arXiv.org\/abs\/2310.06825\u00a0[cs.CL]"},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Mustaqeem Khan Phuong-Nam Tran Nhat\u00a0Truong Pham Abdulmotaleb El\u00a0Saddik and Alice Othmani. 2025. MemoCMT: multimodal emotion recognition using cross-modal transformer-based feature fusion. Scientific reports 15 1 (2025) 5473.","DOI":"10.1038\/s41598-025-89202-x"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.5555\/3545946.3598702"},{"key":"e_1_3_3_2_42_2","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3678957.3685759"},{"key":"e_1_3_3_2_44_2","unstructured":"CU\u00a0Om Kumar N Gowtham Mohammed Zakariah and Absulaziz Almazyad. 2024. Multimodal emotion recognition using feature fusion: An llm-based approach. IEEE Access (2024)."},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"crossref","unstructured":"Maria Kyrarini Fotios Lygerakis Akilesh Rajavenkatanarayanan Christos Sevastopoulos Harish\u00a0Ram Nambiappan Kodur\u00a0Krishna Chaitanya Ashwin\u00a0Ramesh Babu Joanne Mathew and Fillia Makedon. 2021. A survey of robots in healthcare. Technologies 9 1 (2021) 8.","DOI":"10.3390\/technologies9010008"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446272"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"crossref","unstructured":"Kwan\u00a0Min Lee Younbo Jung Jaywoo Kim and Sang\u00a0Ryong Kim. 2006. Are physically embodied social agents better than disembodied social agents?: The effects of physical embodiment tactile interaction and people\u2019s loneliness in human\u2013robot interaction. International journal of human-computer studies 64 10 (2006) 962\u2013973.","DOI":"10.1016\/j.ijhcs.2006.05.002"},{"key":"e_1_3_3_2_48_2","first-page":"2803","volume-title":"Interspeech","author":"Li Yuanchao","year":"2019","unstructured":"Yuanchao Li, Tianyu Zhao, Tatsuya Kawahara, et\u00a0al. 2019. Improved end-to-end speech emotion recognition using self attention mechanism and multitask learning.. In Interspeech. 2803\u20132807."},{"key":"e_1_3_3_2_49_2","unstructured":"Ying-Chun Lin Jennifer Neville Jack\u00a0W Stokes Longqi Yang Tara Safavi Mengting Wan Scott Counts Siddharth Suri Reid Andersen Xiaofeng Xu et\u00a0al. 2024. Interpretable user satisfaction estimation for conversational systems with large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.12388 (2024)."},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"crossref","unstructured":"Xiao Liu Kaixuan Ji Yicheng Fu Weng\u00a0Lam Tam Zhengxiao Du Zhilin Yang and Jie Tang. 2021. P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2110.07602 (2021).","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"e_1_3_3_2_51_2","unstructured":"Yinhan Liu Myle Ott Naman Goyal Jingfei Du Mandar Joshi Danqi Chen Omer Levy Mike Lewis Luke Zettlemoyer and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1907.11692 (2019)."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671552"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"crossref","unstructured":"Lu-Shih\u00a0Alex Low Namunu\u00a0C Maddage Margaret Lech Lisa\u00a0B Sheeber and Nicholas\u00a0B Allen. 2010. Detection of clinical depression in adolescents\u2019 speech during family interactions. IEEE transactions on biomedical engineering 58 3 (2010) 574\u2013586.","DOI":"10.1109\/TBME.2010.2091640"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"crossref","unstructured":"Yaobin Lu Tao Zhou and Bin Wang. 2009. Exploring Chinese users\u2019 acceptance of instant messaging using the theory of planned behavior the technology acceptance model and the flow theory. Computers in human behavior 25 1 (2009) 29\u201339.","DOI":"10.1016\/j.chb.2008.06.002"},{"key":"e_1_3_3_2_55_2","unstructured":"Tomas Mikolov Kai Chen Greg Corrado and Jeffrey Dean. 2013. Efficient estimation of word representations in vector space. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1301.3781 (2013)."},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-35699-6_16"},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"crossref","unstructured":"Ali Mollahosseini Behzad Hasani and Mohammad\u00a0H Mahoor. 2017. Affectnet: A database for facial expression valence and arousal computing in the wild. IEEE Transactions on Affective Computing 10 1 (2017) 18\u201331.","DOI":"10.1109\/TAFFC.2017.2740923"},{"key":"e_1_3_3_2_58_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747870"},{"key":"e_1_3_3_2_59_2","unstructured":"Niklas Muennighoff Nouamane Tazi Lo\u00efc Magne and Nils Reimers. 2022. MTEB: Massive Text Embedding Benchmark. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.07316 (2022)."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i18.34157"},{"key":"e_1_3_3_2_61_2","unstructured":"Jianmo Ni Gustavo\u00a0Hernandez Abrego Noah Constant Ji Ma Keith\u00a0B Hall Daniel Cer and Yinfei Yang. 2021. Sentence-t5: Scalable sentence encoders from pre-trained text-to-text models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2108.08877 (2021)."},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447044"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3678957.3685729"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.5555\/265013"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"crossref","unstructured":"Jonathan Posner James\u00a0A Russell and Bradley\u00a0S Peterson. 2005. The circumplex model of affect: An integrative approach to affective neuroscience cognitive development and psychopathology. Development and psychopathology 17 3 (2005) 715\u2013734.","DOI":"10.1017\/S0954579405050340"},{"key":"e_1_3_3_2_68_2","volume-title":"The Power of Fun: Why fun is the key to a happy and healthy life","author":"Price Catherine","year":"2022","unstructured":"Catherine Price. 2022. The Power of Fun: Why fun is the key to a happy and healthy life. Random House."},{"key":"e_1_3_3_2_69_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_2_70_2","unstructured":"Colin Raffel Noam Shazeer Adam Roberts Katherine Lee Sharan Narang Michael Matena Yanqi Zhou Wei Li and Peter\u00a0J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21 140 (2020) 1\u201367."},{"key":"e_1_3_3_2_71_2","unstructured":"Vipula Rawte Amit Sheth and Amitava Das. 2023. A survey of hallucination in large foundation models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2309.05922 (2023)."},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"crossref","unstructured":"Merle\u00a0M. Reimann Florian\u00a0A. Kunneman Catharine Oertel and Koen\u00a0V. Hindriks. 2024. A Survey on Dialogue Management in Human-robot Interaction. J. Hum.-Robot Interact. 13 2 Article 22 (June 2024) 22\u00a0pages.","DOI":"10.1145\/3648605"},{"key":"e_1_3_3_2_73_2","unstructured":"Shafiq Rayhan Joty Caiming Xiong Yingbo Zhou Semih\u00a0Yavuz Rui\u00a0Meng* Ye\u00a0Liu*. 2024. SFR-Embedding-2: Advanced Text Embedding with Multi-stage Training. https:\/\/huggingface.co\/Salesforce\/SFR-Embedding-2_R"},{"key":"e_1_3_3_2_74_2","doi-asserted-by":"crossref","unstructured":"James\u00a0A Russell Anna Weiss and Gerald\u00a0A Mendelsohn. 1989. Affect grid: a single-item scale of pleasure and arousal. Journal of personality and social psychology 57 3 (1989) 493.","DOI":"10.1037\/0022-3514.57.3.493"},{"key":"e_1_3_3_2_75_2","unstructured":"Maarten Sap Ronan LeBras Daniel Fried and Yejin Choi. 2022. Neural theory-of-mind? on the limits of social intelligence in large lms. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.13312 (2022)."},{"key":"e_1_3_3_2_76_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-74889-2_18"},{"key":"e_1_3_3_2_77_2","doi-asserted-by":"crossref","unstructured":"Shamane Siriwardhana Tharindu Kaluarachchi Mark Billinghurst and Suranga Nanayakkara. 2020. Multimodal emotion recognition with transformer-based self supervised feature fusion. Ieee Access 8 (2020) 176274\u2013176285.","DOI":"10.1109\/ACCESS.2020.3026823"},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"publisher","DOI":"10.1109\/HRI61500.2025.10973958"},{"key":"e_1_3_3_2_79_2","unstructured":"Rickard Stureborg Dimitris Alikaniotis and Yoshi Suhara. 2024. Large language models are inconsistent and biased evaluators. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.01724 (2024)."},{"key":"e_1_3_3_2_80_2","unstructured":"Gemma Team Aishwarya Kamath Johan Ferret Shreya Pathak Nino Vieillard Ramona Merhej Sarah Perrin Tatiana Matejovicova Alexandre Ram\u00e9 Morgane Rivi\u00e8re et\u00a0al. 2025. Gemma 3 technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.19786 (2025)."},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"crossref","unstructured":"Antoine Toisoul Jean Kossaifi Adrian Bulat Georgios Tzimiropoulos and Maja Pantic. 2021. Estimation of continuous valence and arousal levels from faces in naturalistic conditions. Nature Machine Intelligence 3 1 (2021) 42\u201350.","DOI":"10.1038\/s42256-020-00280-0"},{"key":"e_1_3_3_2_82_2","unstructured":"Zhan Tong Yibing Song Jue Wang and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems 35 (2022) 10078\u201310093."},{"key":"e_1_3_3_2_83_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_3_2_84_2","doi-asserted-by":"publisher","DOI":"10.1145\/3551876.3554806"},{"key":"e_1_3_3_2_85_2","doi-asserted-by":"crossref","unstructured":"Johannes Wagner Andreas Triantafyllopoulos Hagen Wierstorf Maximilian Schmitt Felix Burkhardt Florian Eyben and Bj\u00f6rn\u00a0W Schuller. 2023. Dawn of the transformer era in speech emotion recognition: closing the valence gap. IEEE Transactions on Pattern Analysis and Machine Intelligence 45 9 (2023) 10745\u201310759.","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"e_1_3_3_2_86_2","unstructured":"Liang Wang Nan Yang Xiaolong Huang Linjun Yang Rangan Majumder and Furu Wei. 2023. Improving text embeddings with large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.00368 (2023)."},{"key":"e_1_3_3_2_87_2","doi-asserted-by":"crossref","unstructured":"Yuezhou Wu Siling Zhang and Pengfei Li. 2025. Multi-modal emotion recognition in conversation based on prompt learning with text-audio fusion features. Scientific Reports 15 1 (2025) 8855.","DOI":"10.1038\/s41598-025-89758-8"},{"key":"e_1_3_3_2_88_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.1994.323812"},{"key":"e_1_3_3_2_89_2","doi-asserted-by":"crossref","unstructured":"Chenghao Zhang and Lei Xue. 2021. Autoencoder with emotion embedding for speech emotion recognition. IEEE access 9 (2021) 51231\u201351241.","DOI":"10.1109\/ACCESS.2021.3069818"},{"key":"e_1_3_3_2_90_2","unstructured":"Yazhou Zhang Mengyao Wang Youxi Wu Prayag Tiwari Qiuchi Li Benyou Wang and Jing Qin. 2023. Dialoguellm: Context and emotion knowledge-tuned large language models for emotion recognition in conversations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.11374 (2023)."},{"key":"e_1_3_3_2_91_2","doi-asserted-by":"crossref","unstructured":"Zixing Zhang Liyizhe Peng Tao Pang Jing Han Huan Zhao and Bj\u00f6rn\u00a0W Schuller. 2024. Refashioning emotion recognition modelling: The advent of generalised large models. IEEE Transactions on Computational Social Systems (2024).","DOI":"10.1109\/TCSS.2024.3396345"}],"event":{"name":"ICMI '25: International Conference on Multimodal Interaction","location":"Canberra Australia","acronym":"ICMI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 27th International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3716553.3750747","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,26]],"date-time":"2026-01-26T22:27:18Z","timestamp":1769466438000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3716553.3750747"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":90,"alternative-id":["10.1145\/3716553.3750747","10.1145\/3716553"],"URL":"https:\/\/doi.org\/10.1145\/3716553.3750747","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-10-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}