{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T15:18:52Z","timestamp":1777130332823,"version":"3.51.4"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031843525","type":"print"},{"value":"9783031843532","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-84353-2_16","type":"book-chapter","created":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T13:47:29Z","timestamp":1740491249000},"page":"184-194","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Investigating HuBERT-Based Speech Emotion Recognition Generalisation Capability"],"prefix":"10.1007","author":[{"given":"Letian","family":"Li","sequence":"first","affiliation":[]},{"given":"Cornelius","family":"Glackin","sequence":"additional","affiliation":[]},{"given":"Nigel","family":"Cannings","sequence":"additional","affiliation":[]},{"given":"Vito","family":"Veneziano","sequence":"additional","affiliation":[]},{"given":"Jack","family":"Barker","sequence":"additional","affiliation":[]},{"given":"Olakunle","family":"Oduola","sequence":"additional","affiliation":[]},{"given":"Chris","family":"Woodruff","sequence":"additional","affiliation":[]},{"given":"Thea","family":"Laird","sequence":"additional","affiliation":[]},{"given":"James","family":"Laird","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Sun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,2,17]]},"reference":[{"issue":"4","key":"16_CR1","doi-asserted-by":"publisher","first-page":"1249","DOI":"10.3390\/s21041249","volume":"21","author":"BJ Abbaschian","year":"2021","unstructured":"Abbaschian, B.J., Sierra-Sosa, D., Elmaghraby, A.: Deep learning techniques for speech emotion recognition, from databases to models. Sensors 21(4), 1249 (2021)","journal-title":"Sensors"},{"key":"16_CR2","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inform. Process. Syst. 33, 12449\u201312460 (2020)"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Bagher\u00a0Zadeh, A., Liang, P.P., Poria, S., Cambria, E., Morency, L.P.: Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2236\u20132246. Association for Computational Linguistics, Melbourne, Australia (Jul 2018)","DOI":"10.18653\/v1\/P18-1208"},{"key":"16_CR4","unstructured":"Bommasani, R., et\u00a0al.: On the opportunities and risks of foundation models. arXiv preprint arXiv:2108.07258 (2021)"},{"key":"16_CR5","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso, C., et al.: Iemocap: interactive emotional dyadic motion capture database. Lang. Resour. Eval. 42, 335\u2013359 (2008)","journal-title":"Lang. Resour. Eval."},{"issue":"4","key":"16_CR6","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao, H., Cooper, D.G., Keutmann, M.K., Gur, R.C., Nenkova, A., Verma, R.: Crema-d: crowd-sourced emotional multimodal actors dataset. IEEE Trans. Affect. Comput. 5(4), 377\u2013390 (2014)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"16_CR7","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"issue":"1","key":"16_CR8","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1109\/79.911197","volume":"18","author":"R Cowie","year":"2001","unstructured":"Cowie, R., et al.: Emotion recognition in human-computer interaction. IEEE Signal Process. Mag. 18(1), 32\u201380 (2001)","journal-title":"IEEE Signal Process. Mag."},{"key":"16_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (Jun 2019)"},{"issue":"3","key":"16_CR10","doi-asserted-by":"publisher","first-page":"572","DOI":"10.1016\/j.patcog.2010.09.020","volume":"44","author":"M El Ayadi","year":"2011","unstructured":"El Ayadi, M., Kamel, M.S., Karray, F.: Survey on speech emotion recognition: Features, classification schemes, and databases. Pattern Recogn. 44(3), 572\u2013587 (2011)","journal-title":"Pattern Recogn."},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Etienne, C., Fidanza, G., Petrovskii, A., Devillers, L., Schmauch, B.: CNN+LSTM architecture for Speech Emotion Recognition with Data Augmentation. In: Proceedings of Workshop on Speech, Music and Mind (SMM 2018), pp. 21\u201325 (2018)","DOI":"10.21437\/SMM.2018-5"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Georgescu, M.I., Ionescu, R.T., Ristea, N.C., Sebe, N.: Non-linear neurons with human-like apical dendrite activations. arXiv preprint arXiv:2003.03229 (2020)","DOI":"10.36227\/techrxiv.11830761.v1"},{"key":"16_CR13","unstructured":"Haq, S., Jackson, P., Edge, J.: Audio-visual feature selection and reduction for emotion classification. In: Proceedings of International Conference on Auditory-Visual Speech Processing (AVSP 2008), Tangalooma, Australia (Sept 2008)"},{"key":"16_CR14","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Iyer, S., Glackin, C., Cannings, N., Veneziano, V., Sun, Y.: A comparison between convolutional and transformer architectures for speech emotion recognition. In: 2022 International Joint Conference on Neural Networks (IJCNN), pp.\u00a01\u20138. IEEE (2022)","DOI":"10.1109\/IJCNN55064.2022.9891882"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Kahn, J., et\u00a0al.: Libri-light: a benchmark for asr with limited or no supervision. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 7669\u20137673. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"16_CR17","doi-asserted-by":"publisher","first-page":"117327","DOI":"10.1109\/ACCESS.2019.2936124","volume":"7","author":"RA Khalil","year":"2019","unstructured":"Khalil, R.A., Jones, E., Babar, M.I., Jan, T., Zafar, M.H., Alhussain, T.: Speech emotion recognition using deep learning techniques: A review. IEEE Access 7, 117327\u2013117345 (2019)","journal-title":"IEEE Access"},{"key":"16_CR18","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1007\/s10772-011-9125-1","volume":"15","author":"SG Koolagudi","year":"2012","unstructured":"Koolagudi, S.G., Rao, K.S.: Emotion recognition from speech: a review. Int. J. Speech Technol. 15, 99\u2013117 (2012)","journal-title":"Int. J. Speech Technol."},{"issue":"5","key":"16_CR19","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S.R., Russo, F.A.: The ryerson audio-visual database of emotional speech and song (ravdess): a dynamic, multimodal set of facial and vocal expressions in north american english. PLoS ONE 13(5), e0196391 (2018)","journal-title":"PLoS ONE"},{"issue":"22","key":"16_CR20","doi-asserted-by":"publisher","first-page":"7665","DOI":"10.3390\/s21227665","volume":"21","author":"C Luna-Jim\u00e9nez","year":"2021","unstructured":"Luna-Jim\u00e9nez, C., Griol, D., Callejas, Z., Kleinlein, R., Montero, J.M., Fern\u00e1ndez-Mart\u00ednez, F.: Multimodal emotion recognition on ravdess dataset using transfer learning. Sensors 21(22), 7665 (2021)","journal-title":"Sensors"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an asr corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210. IEEE (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Pandey, S.K., Shekhawat, H.S., Prasanna, S.M.: Deep learning techniques for speech emotion recognition: a review. In: 2019 29th International Conference Radioelektronika (RADIOELEKTRONIKA), pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/RADIOELEK.2019.8733432"},{"key":"16_CR23","unstructured":"Pichora-Fuller, M.K., Dupuis, K.: Toronto emotional speech set (TESS) (2020)"},{"key":"16_CR24","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. arXiv preprint arXiv:2212.04356 (2022)"},{"issue":"5","key":"16_CR25","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1145\/3129340","volume":"61","author":"BW Schuller","year":"2018","unstructured":"Schuller, B.W.: Speech emotion recognition: two decades in a nutshell, benchmarks, and ongoing trends. Commun. ACM 61(5), 90\u201399 (2018)","journal-title":"Commun. ACM"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Trabelsi, A., Frasson, C.: The emotional machine: a machine learning approach to online prediction of user\u2019s emotion and intensity. In: 2010 10th IEEE International Conference on Advanced Learning Technologies, pp. 613\u2013617. IEEE (2010)","DOI":"10.1109\/ICALT.2010.174"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Wagner, J., Triantafyllopoulos, A., Wierstorf, H., Schmitt, M., Eyben, F., Schuller, B.W.: Dawn of the transformer era in speech emotion recognition: closing the valence gap. arXiv preprint arXiv:2203.07378 (2022)","DOI":"10.1109\/TPAMI.2023.3263585"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Ye, J., Wen, X., Wei, Y., Xu, Y., Liu, K., Shan, H.: Temporal modeling matters: A novel temporal emotional modeling approach for speech emotion recognition. arXiv preprint arXiv:2211.08233 (2022)","DOI":"10.1109\/ICASSP49357.2023.10096370"}],"container-title":["Lecture Notes in Computer Science","Artificial Intelligence and Soft Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-84353-2_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T13:47:37Z","timestamp":1740491257000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-84353-2_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031843525","9783031843532"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-84353-2_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"17 February 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICAISC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Intelligence and Soft Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Zakopane","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Poland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 June 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 June 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icaisc2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icaisc.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}