{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:41:53Z","timestamp":1767339713879,"version":"3.41.0"},"publisher-location":"Cham","reference-count":81,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031915741","type":"print"},{"value":"9783031915758","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91575-8_13","type":"book-chapter","created":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T17:57:10Z","timestamp":1748195830000},"page":"204-222","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["VATE: A Large Scale Multimodal Spontaneous Dataset for\u00a0Affective Evaluation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5218-1118","authenticated-orcid":false,"given":"Francesco","family":"Agnelli","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9274-4047","authenticated-orcid":false,"given":"Giuliano","family":"Grossi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8210-4457","authenticated-orcid":false,"given":"Alessandro","family":"D\u2019Amelio","sequence":"additional","affiliation":[]},{"given":"Marco","family":"De Paoli","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8534-4413","authenticated-orcid":false,"given":"Raffaella","family":"Lanzarotti","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"13_CR1","doi-asserted-by":"crossref","unstructured":"Adoma, A.F., Henry, N.M., Chen, W.: Comparative analyses of Bert, Roberta, Distilbert, and Xlnet for text-based emotion recognition. In: 2020 17th International Computer Conference on Wavelet Active Media Technology and Information Processing (ICCWAMTIP), pp. 117\u2013121. IEEE (2020)","DOI":"10.1109\/ICCWAMTIP51612.2020.9317379"},{"key":"13_CR2","first-page":"24206","volume":"34","author":"H Akbari","year":"2021","unstructured":"Akbari, H., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. Adv. Neural. Inf. Process. Syst. 34, 24206\u201324221 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"issue":"1","key":"13_CR4","doi-asserted-by":"publisher","first-page":"20284","DOI":"10.1038\/s41598-020-77117-8","volume":"10","author":"B Azari","year":"2020","unstructured":"Azari, B., et al.: Comparing supervised and unsupervised approaches to emotion categorization in the human brain, body, and subjective experience. Sci. Rep. 10(1), 20284 (2020)","journal-title":"Sci. Rep."},{"issue":"1","key":"13_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1177\/1529100619832930","volume":"20","author":"LF Barrett","year":"2019","unstructured":"Barrett, L.F., Adolphs, R., Marsella, S., Martinez, A.M., Pollak, S.D.: Emotional expressions reconsidered: challenges to inferring emotion from human facial movements. Psychol. Sci. Public Interest 20(1), 1\u201368 (2019)","journal-title":"Psychol. Sci. Public Interest"},{"key":"13_CR6","doi-asserted-by":"crossref","unstructured":"Barros, P., Churamani, N., Lakomkin, E., Siqueira, H., Sutherland, A., Wermter, S.: The omg-emotion behavior dataset. In: 2018 International Joint Conference on Neural Networks (IJCNN), pp.\u00a01\u20137. IEEE (2018)","DOI":"10.1109\/IJCNN.2018.8489099"},{"key":"13_CR7","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1016\/j.patrec.2015.03.005","volume":"66","author":"S Bilakhia","year":"2015","unstructured":"Bilakhia, S., Petridis, S., Nijholt, A., Pantic, M.: The MAHNOB mimicry database: a database of naturalistic human interactions. Pattern Recogn. Lett. 66, 52\u201361 (2015)","journal-title":"Pattern Recogn. Lett."},{"key":"13_CR8","doi-asserted-by":"crossref","unstructured":"Boccignone, G., Conte, D., Cuculo, V., Lanzarotti, R.: AMHUSE: a multimodal dataset for humour sensing. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 438\u2013445 (2017)","DOI":"10.1145\/3136755.3136806"},{"key":"13_CR9","doi-asserted-by":"crossref","unstructured":"Boccignone, G., et\u00a0al.: Stairway to elders: bridging space, time and emotions in their social environment for wellbeing. In: ICPRAM 2020-Proceedings of the 9th International Conference on Pattern Recognition Applications and Methods, pp. 548\u2013554. SciTePress (2020)","DOI":"10.5220\/0009106605480554"},{"key":"13_CR10","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso, C., et al.: IEMOCAP: interactive emotional dyadic motion capture database. Lang. Resour. Eval. 42, 335\u2013359 (2008). https:\/\/doi.org\/10.1007\/s10579-008-9076-6","journal-title":"Lang. Resour. Eval."},{"key":"13_CR11","doi-asserted-by":"publisher","first-page":"593","DOI":"10.1016\/j.ins.2021.10.005","volume":"582","author":"FZ Canal","year":"2022","unstructured":"Canal, F.Z., et al.: A survey on facial emotion recognition techniques: a state-of-the-art literature review. Inf. Sci. 582, 593\u2013617 (2022)","journal-title":"Inf. Sci."},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"13_CR13","doi-asserted-by":"crossref","unstructured":"Chen, C., Hou, N., Hu, Y., Zou, H., Qi, X., Chng, E.S.: Interactive audio-text representation for automated audio captioning with contrastive learning. 
arXiv preprint arXiv:2203.15526 (2022)","DOI":"10.21437\/Interspeech.2022-10510"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Chen, M., Wang, S., Liang, P.P., Baltru\u0161aitis, T., Zadeh, A., Morency, L.P.: Multimodal sentiment analysis with word-level fusion and reinforcement learning. In: Proceedings of the 19th ACM International Conference on Multimodal Interaction, pp. 163\u2013171 (2017)","DOI":"10.1145\/3136755.3136801"},{"key":"13_CR15","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"13_CR16","unstructured":"Corbin, A.: derm-ita. https:\/\/pypi.org\/project\/derm-ita\/"},{"issue":"1","key":"13_CR17","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1109\/TAFFC.2021.3053275","volume":"14","author":"J Deng","year":"2021","unstructured":"Deng, J., Ren, F.: A survey of textual emotion recognition and its challenges. IEEE Trans. Affect. Comput. 14(1), 49\u201367 (2021)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"13_CR18","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"13_CR19","doi-asserted-by":"crossref","unstructured":"Ding, H., Zhou, S.K., Chellappa, R.: FaceNet2ExpNet: regularizing a deep face recognition net for expression recognition. In: 2017 12th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2017), pp. 118\u2013126. IEEE (2017)","DOI":"10.1109\/FG.2017.23"},{"issue":"6","key":"13_CR20","doi-asserted-by":"publisher","first-page":"2885","DOI":"10.3390\/s23062885","volume":"23","author":"A D\u2019Amelio","year":"2023","unstructured":"D\u2019Amelio, A., Patania, S., Bur\u0161i\u0107, S., Cuculo, V., Boccignone, G.: Inferring causal factors of core affect dynamics on social participation through the lens of the observer. Sensors 23(6), 2885 (2023)","journal-title":"Sensors"},{"key":"13_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1155\/2010\/263593","volume":"2010","author":"F Eyben","year":"2010","unstructured":"Eyben, F., et al.: Emotion on the road: necessity, acceptance, and feasibility of affective computing in the car. Adv. Hum. Comput. Interact. 2010, 1\u201317 (2010)","journal-title":"Adv. Hum. Comput. Interact."},{"key":"13_CR22","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2020.102951","volume":"110","author":"MS Fahad","year":"2021","unstructured":"Fahad, M.S., Ranjan, A., Yadav, J., Deepak, A.: A survey of speech emotion recognition in natural environment. Digital Sig. Process. 110, 102951 (2021)","journal-title":"Digital Sig. Process."},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Felbo, B., Mislove, A., S\u00f8gaard, A., Rahwan, I., Lehmann, S.: Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm. arXiv preprint arXiv:1708.00524 (2017)","DOI":"10.18653\/v1\/D17-1169"},{"key":"13_CR24","doi-asserted-by":"crossref","unstructured":"Grimm, M., Kroschel, K., Narayanan, S.: The Vera am Mittag German audio-visual emotional speech database. In: 2008 IEEE International Conference on Multimedia and Expo, pp. 865\u2013868. 
IEEE (2008)","DOI":"10.1109\/ICME.2008.4607572"},{"key":"13_CR25","doi-asserted-by":"crossref","unstructured":"Guzhov, A., Raue, F., Hees, J., Dengel, A.: AudioClip: extending clip to image, text and audio. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 976\u2013980. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"13_CR26","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"13_CR27","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1007\/s11036-016-0685-9","volume":"21","author":"MS Hossain","year":"2016","unstructured":"Hossain, M.S., Muhammad, G., Alhamid, M.F., Song, B., Al-Mutib, K.: Audio-visual emotion recognition using big data towards 5G. Mob. Netw. Appl. 21, 753\u2013763 (2016)","journal-title":"Mob. Netw. Appl."},{"key":"13_CR28","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"1","key":"13_CR29","doi-asserted-by":"publisher","first-page":"2","DOI":"10.3390\/technologies9010002","volume":"9","author":"A Jaiswal","year":"2020","unstructured":"Jaiswal, A., Babu, A.R., Zadeh, M.Z., Banerjee, D., Makedon, F.: A survey on contrastive self-supervised learning. Technologies 9(1), 2 (2020)","journal-title":"Technologies"},{"issue":"2","key":"13_CR30","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1007\/s12193-015-0195-2","volume":"10","author":"SE Kahou","year":"2015","unstructured":"Kahou, S.E., et al.: EmoNets: Multimodal deep learning approaches for emotion recognition in video. J. Multimodal User Interfaces 10(2), 99\u2013111 (2015). https:\/\/doi.org\/10.1007\/s12193-015-0195-2","journal-title":"J. Multimodal User Interfaces"},{"key":"13_CR31","unstructured":"Kataoka, H., Wakamiya, T., Hara, K., Satoh, Y.: Would mega-scale datasets further enhance spatiotemporal 3D CNNs? arXiv preprint arXiv:2004.04968 (2020)"},{"key":"13_CR32","doi-asserted-by":"crossref","unstructured":"Kim, Y., Lee, H., Provost, E.M.: Deep learning for robust feature generation in audiovisual emotion recognition. In: 2013 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 3687\u20133691. IEEE (2013)","DOI":"10.1109\/ICASSP.2013.6638346"},{"key":"13_CR33","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1016\/j.imavis.2017.02.001","volume":"65","author":"J Kossaifi","year":"2017","unstructured":"Kossaifi, J., Tzimiropoulos, G., Todorovic, S., Pantic, M.: AFEW-VA database for valence and arousal estimation in-the-wild. Image Vis. Comput. 65, 23\u201336 (2017)","journal-title":"Image Vis. Comput."},{"issue":"3","key":"13_CR34","doi-asserted-by":"publisher","first-page":"1022","DOI":"10.1109\/TPAMI.2019.2944808","volume":"43","author":"J Kossaifi","year":"2019","unstructured":"Kossaifi, J., et al.: SEWA DB: a rich database for audio-visual emotion and sentiment research in the wild. IEEE Trans. Pattern Anal. Mach. 
Intell. 43(3), 1022\u20131040 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"13_CR35","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: ClipBERT for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"issue":"6","key":"13_CR36","doi-asserted-by":"publisher","first-page":"913","DOI":"10.1007\/s12652-016-0406-z","volume":"8","author":"Y Li","year":"2016","unstructured":"Li, Y., Tao, J., Chao, L., Bao, W., Liu, Y.: CHEAVD: a Chinese natural emotional audio\u2013Visual database. J. Ambient Intell. Human. Comput. 8(6), 913\u2013924 (2016). https:\/\/doi.org\/10.1007\/s12652-016-0406-z","journal-title":"J. Ambient Intell. Human. Comput."},{"issue":"5","key":"13_CR37","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S.R., Russo, F.A.: The Ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in north American English. PLoS ONE 13(5), e0196391 (2018)","journal-title":"PLoS ONE"},{"key":"13_CR38","unstructured":"Lugaresi, C., et\u00a0al.: MediaPipe: a framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)"},{"issue":"22","key":"13_CR39","doi-asserted-by":"publisher","first-page":"7665","DOI":"10.3390\/s21227665","volume":"21","author":"C Luna-Jim\u00e9nez","year":"2021","unstructured":"Luna-Jim\u00e9nez, C., Griol, D., Callejas, Z., Kleinlein, R., Montero, J.M., Fern\u00e1ndez-Mart\u00ednez, F.: Multimodal emotion recognition on RAVDESS dataset using transfer learning. Sensors 21(22), 7665 (2021)","journal-title":"Sensors"},{"issue":"1","key":"13_CR40","doi-asserted-by":"publisher","first-page":"327","DOI":"10.3390\/app12010327","volume":"12","author":"C Luna-Jim\u00e9nez","year":"2021","unstructured":"Luna-Jim\u00e9nez, C., Kleinlein, R., Griol, D., Callejas, Z., Montero, J.M., Fern\u00e1ndez-Mart\u00ednez, F.: A proposal for multimodal emotion recognition using aural transformers and action units on RAVDESS dataset. Appl. Sci. 12(1), 327 (2021)","journal-title":"Appl. Sci."},{"key":"13_CR41","unstructured":"Ma, S., Zeng, Z., McDuff, D., Song, Y.: Active contrastive learning of audio-visual video representations. arXiv preprint arXiv:2009.09805 (2020)"},{"key":"13_CR42","doi-asserted-by":"crossref","unstructured":"Martin, O., Kotsia, I., Macq, B., Pitas, I.: The eNTERFACE\u201905 audio-visual emotion database. In: 22nd International Conference on Data Engineering Workshops (ICDEW 2006), p.\u00a08. IEEE (2006)","DOI":"10.1109\/ICDEW.2006.145"},{"key":"13_CR43","doi-asserted-by":"crossref","unstructured":"McDuff, D., Kaliouby, R., Senechal, T., Amr, M., Cohn, J., Picard, R.: Affectiva-MIT facial expression dataset (AM-FED): naturalistic and spontaneous facial expressions collected. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 881\u2013888 (2013)","DOI":"10.1109\/CVPRW.2013.130"},{"key":"13_CR44","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"13_CR45","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"527","DOI":"10.1007\/978-3-319-46448-0_32","volume-title":"Computer Vision \u2013 ECCV 2016","author":"I Misra","year":"2016","unstructured":"Misra, I., Zitnick, C.L., Hebert, M.: Shuffle and learn: unsupervised learning using temporal order verification. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 527\u2013544. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_32"},{"key":"13_CR46","doi-asserted-by":"crossref","unstructured":"Mollahosseini, A., Chan, D., Mahoor, M.H.: Going deeper in facial expression recognition using deep neural networks. In: 2016 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 1\u201310. IEEE (2016)","DOI":"10.1109\/WACV.2016.7477450"},{"key":"13_CR47","doi-asserted-by":"crossref","unstructured":"Ng, H.W., Nguyen, V.D., Vonikakis, V., Winkler, S.: Deep learning for emotion recognition on small datasets using transfer learning. In: Proceedings of the 2015 ACM on International Conference on Multimodal Interaction, pp. 443\u2013449 (2015)","DOI":"10.1145\/2818346.2830593"},{"key":"13_CR48","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"13_CR49","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.126866","volume":"561","author":"B Pan","year":"2023","unstructured":"Pan, B., Hirota, K., Jia, Z., Dai, Y.: A review of multimodal emotion recognition from datasets, preprocessing, features, and fusion methods. Neurocomputing 561, 126866 (2023)","journal-title":"Neurocomputing"},{"key":"13_CR50","doi-asserted-by":"crossref","unstructured":"Pan, T., Song, Y., Yang, T., Jiang, W., Liu, W.: VideoMoCo: contrastive video representation learning with temporally adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11205\u201311214 (2021)","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"13_CR51","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"501","DOI":"10.1007\/978-3-319-99579-3_52","volume-title":"Speech and Computer","author":"O Perepelkina","year":"2018","unstructured":"Perepelkina, O., Kazimirova, E., Konstantinova, M.: RAMAS: Russian multimodal corpus of dyadic interaction for affective computing. In: Karpov, A., Jokisch, O., Potapova, R. (eds.) SPECOM 2018. LNCS (LNAI), vol. 11096, pp. 501\u2013510. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-319-99579-3_52"},{"key":"13_CR52","unstructured":"P\u00e9rez-Rosas, V., Mihalcea, R., Morency, L.P.: Utterance-level multimodal sentiment analysis. In: Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 973\u2013982 (2013)"},{"key":"13_CR53","doi-asserted-by":"crossref","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., Mihalcea, R.: MELD: a multimodal multi-party dataset for emotion recognition in conversations. arXiv preprint arXiv:1810.02508 (2018)","DOI":"10.18653\/v1\/P19-1050"},{"key":"13_CR54","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Spatiotemporal contrastive video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
6964\u20136974 (2021)","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"13_CR55","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"13_CR56","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"key":"13_CR57","doi-asserted-by":"crossref","unstructured":"Ranganathan, H., Chakraborty, S., Panchanathan, S.: Multimodal emotion recognition using deep learning architectures. In: 2016 IEEE Winter Conference on Applications of Computer Vision (WACV), pp.\u00a01\u20139. IEEE (2016)","DOI":"10.1109\/WACV.2016.7477679"},{"key":"13_CR58","doi-asserted-by":"crossref","unstructured":"Ringeval, F., Sonderegger, A., Sauer, J., Lalanne, D.: Introducing the RECOLA multimodal corpus of remote collaborative and affective interactions. In: 2013 10th IEEE International Conference and Workshops on Automatic Face and Gesture Recognition (FG), pp.\u00a01\u20138. IEEE (2013)","DOI":"10.1109\/FG.2013.6553805"},{"issue":"1","key":"13_CR59","first-page":"53","volume":"2","author":"A Saxena","year":"2020","unstructured":"Saxena, A., Khanna, A., Gupta, D.: Emotion recognition and detection methods: a comprehensive survey. J. Artif. Intell. Syst. 2(1), 53\u201379 (2020)","journal-title":"J. Artif. Intell. Syst."},{"key":"13_CR60","doi-asserted-by":"publisher","unstructured":"Serengil, S.I., Ozpinar, A.: Hyperextended lightface: a facial attribute analysis framework. In: 2021 International Conference on Engineering and Emerging Technologies (ICEET), pp.\u00a01\u20134. IEEE (2021). https:\/\/doi.org\/10.1109\/ICEET53442.2021.9659697, https:\/\/ieeexplore.ieee.org\/document\/9659697","DOI":"10.1109\/ICEET53442.2021.9659697"},{"issue":"2","key":"13_CR61","doi-asserted-by":"publisher","first-page":"781","DOI":"10.1109\/TAFFC.2020.2964549","volume":"13","author":"A Shukla","year":"2020","unstructured":"Shukla, A., Gullapuram, S.S., Katti, H., Kankanhalli, M., Winkler, S., Subramanian, R.: Recognition of advertisement emotions with application to computational advertising. IEEE Trans. Affect. Comput. 13(2), 781\u2013792 (2020)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"13_CR62","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7464\u20137473 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"13_CR63","doi-asserted-by":"crossref","unstructured":"Suzuki, T., Itazuri, T., Hara, K., Kataoka, H.: Learning spatiotemporal 3D convolution with video order self-supervision. In: Proceedings of the European Conference on Computer Vision (ECCV) Workshops (2018)","DOI":"10.1007\/978-3-030-11012-3_45"},{"key":"13_CR64","doi-asserted-by":"crossref","unstructured":"Wang, H., Meghawat, A., Morency, L.P., Xing, E.P.: Select-additive learning: improving generalization in multimodal sentiment analysis. In: 2017 IEEE International Conference on Multimedia and Expo (ICME), pp. 949\u2013954. 
IEEE (2017)","DOI":"10.1109\/ICME.2017.8019301"},{"issue":"3","key":"13_CR65","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1109\/MIS.2013.34","volume":"28","author":"M W\u00f6llmer","year":"2013","unstructured":"W\u00f6llmer, M., et al.: Youtube movie reviews: sentiment analysis in an audio-visual context. IEEE Intell. Syst. 28(3), 46\u201353 (2013)","journal-title":"IEEE Intell. Syst."},{"key":"13_CR66","doi-asserted-by":"crossref","unstructured":"Xu, D., Xiao, J., Zhao, Z., Shao, J., Xie, D., Zhuang, Y.: Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10334\u201310343 (2019)","DOI":"10.1109\/CVPR.2019.01058"},{"key":"13_CR67","doi-asserted-by":"publisher","DOI":"10.1016\/j.compedu.2019.103649","volume":"142","author":"E Yadegaridehkordi","year":"2019","unstructured":"Yadegaridehkordi, E., Noor, N., Ayub, M., Affal, H.B., Hussin, N.B.: Affective computing in education: a systematic review and future research. Comput. Educ. 142, 103649 (2019)","journal-title":"Comput. Educ."},{"key":"13_CR68","unstructured":"Yang, C., Xu, Y., Dai, B., Zhou, B.: Video representation learning with visual tempo consistency. arXiv preprint arXiv:2006.15489 (2020)"},{"key":"13_CR69","unstructured":"Yannakakis, G.N.: Enhancing health care via affective computing (2018)"},{"key":"13_CR70","unstructured":"YouTube: 70 people, ages 5 to 75 [video playlist]. https:\/\/www.youtube.com\/playlist?list=PL1TPHg7HzcUo1-ewKRu1zCoaDRwPoGxCh. Accessed 27 May 2024"},{"key":"13_CR71","unstructured":"YouTube: 70 people, ages 5 to 75 [video playlist]. https:\/\/www.youtube.com\/playlist?list=PLjrML-f5aMc6vkuhQo6w-m53yBDisz4nU. Accessed 27 May 2024"},{"key":"13_CR72","unstructured":"YouTube: 70 people, ages 5 to 75 [video playlist]. https:\/\/www.youtube.com\/playlist?list=PLJic7bfGlo3qxHqFNEADdFjp074mqebyx. Accessed 27 May 2024"},{"key":"13_CR73","doi-asserted-by":"crossref","unstructured":"Yu, W., et al.: CH-SIMS: a Chinese multimodal sentiment analysis dataset with fine-grained annotation of modality. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 3718\u20133727 (2020)","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"13_CR74","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Chen, M., Poria, S., Cambria, E., Morency, L.P.: Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:1707.07250 (2017)","DOI":"10.18653\/v1\/D17-1115"},{"key":"13_CR75","unstructured":"Zadeh, A.B., Liang, P.P., Poria, S., Cambria, E., Morency, L.P.: Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2236\u20132246 (2018)"},{"issue":"3","key":"13_CR76","doi-asserted-by":"publisher","first-page":"300","DOI":"10.1109\/TAFFC.2016.2553038","volume":"8","author":"S Zhalehpour","year":"2016","unstructured":"Zhalehpour, S., Onder, O., Akhtar, Z., Erdem, C.E.: Baum-1: a spontaneous audio-visual face database of affective and mental states. IEEE Trans. Affect. Comput. 8(3), 300\u2013313 (2016)","journal-title":"IEEE Trans. Affect. 
Comput."},{"key":"13_CR77","doi-asserted-by":"crossref","unstructured":"Zhang, S., Yang, Y., Chen, C., Zhang, X., Leng, Q., Zhao, X.: Deep learning-based multimodal emotion recognition from audio, visual, and text modalities: a systematic review of recent advancements and future prospects. Expert Syst. Appl. 237, 121692 (2023)","DOI":"10.1016\/j.eswa.2023.121692"},{"issue":"10","key":"13_CR78","doi-asserted-by":"publisher","first-page":"3030","DOI":"10.1109\/TCSVT.2017.2719043","volume":"28","author":"S Zhang","year":"2017","unstructured":"Zhang, S., Zhang, S., Huang, T., Gao, W., Tian, Q.: Learning affective features with a hybrid deep model for audio-visual emotion recognition. IEEE Trans. Circuits Syst. Video Technol. 28(10), 3030\u20133043 (2017)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"13_CR79","unstructured":"Zhou, C., et\u00a0al.: A comprehensive survey on pretrained foundation models: a history from BERT to ChatGPT. arXiv preprint arXiv:2302.09419 (2023)"},{"key":"13_CR80","doi-asserted-by":"crossref","unstructured":"Zhuang, C., She, T., Andonian, A., Mark, M.S., Yamins, D.: Unsupervised learning from video with deep neural embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9563\u20139572 (2020)","DOI":"10.1109\/CVPR42600.2020.00958"},{"key":"13_CR81","doi-asserted-by":"crossref","unstructured":"Zong, Y., Mac\u00a0Aodha, O., Hospedales, T.: Self-supervised multimodal learning: a survey. arXiv preprint arXiv:2304.01008 (2023)","DOI":"10.1109\/TPAMI.2024.3429301"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91575-8_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T17:57:30Z","timestamp":1748195850000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91575-8_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031915741","9783031915758"],"references-count":81,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91575-8_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}