{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T01:31:09Z","timestamp":1767317469988,"version":"3.48.0"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032101914","type":"print"},{"value":"9783032101921","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-10192-1_19","type":"book-chapter","created":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T01:28:58Z","timestamp":1767317338000},"page":"224-235","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Study on Multimodal Foundation Models for Affective Video Prediction"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5218-1118","authenticated-orcid":false,"given":"Francesco","family":"Agnelli","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7274-5615","authenticated-orcid":false,"given":"Mirko","family":"Ditroia","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2590-9133","authenticated-orcid":false,"given":"Giorgio","family":"Blandano","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8210-4457","authenticated-orcid":false,"given":"Alessandro","family":"D\u2019Amelio","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0653-8563","authenticated-orcid":false,"given":"Omar","family":"Ghezzi","sequence":"additional","affiliation":[]},{"given":"Marco","family":"De Paoli","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8534-4413","authenticated-orcid":false,"given":"Raffaella","family":"Lanzarotti","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,2]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Agnelli, F., Grossi, G., D\u2019Amelio, A., De\u00a0Paoli, M., Lanzarotti, R.: VATE: a large scale multimodal spontaneous dataset for affective evaluation. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) Computer Vision\u2014ECCV 2024 Workshops. In press. Springer Nature Switzerland, Cham (2025)","DOI":"10.1007\/978-3-031-91575-8_13"},{"key":"19_CR2","unstructured":"Akbari, H., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. Adv. Neural. Inf. Process. Syst. 34, 24206\u201324221 (2021)"},{"issue":"2","key":"19_CR3","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1109\/MIS.2023.3254179","volume":"38","author":"MM Amin","year":"2023","unstructured":"Amin, M.M., Cambria, E., Schuller, B.W.: Will affective computing emerge from foundation models and general artificial intelligence? A first evaluation of ChatGPT. IEEE Intell. Syst. 
38(2), 15\u201323 (2023)","journal-title":"IEEE Intell. Syst."},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Awais, M., et al.: Foundation models defining a new era in vision: a survey and outlook. IEEE Trans. Pattern Anal. Mach. Intell. (2025)","DOI":"10.1109\/TPAMI.2024.3506283"},{"key":"19_CR6","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv. Neural Inf. Process. Syst. 33, 12449\u201312460 (2020)"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Bao, F., Neumann, M., Vu, N.T.: CycleGAN-based emotion style transfer as data augmentation for speech emotion recognition. In: Interspeech, pp. 2828\u20132832 (2019)","DOI":"10.21437\/Interspeech.2019-2293"},{"key":"19_CR8","doi-asserted-by":"publisher","unstructured":"Chauhan, D.S., et al.: M2H2: a multimodal multiparty Hindi dataset for humor recognition in conversations. In: Proceedings of the 23rd ACM International Conference on Multimodal Interaction (ICMI), p.\u00a05. Association for Computing Machinery (2021). https:\/\/doi.org\/10.1145\/1122445.1122456. https:\/\/arxiv.org\/pdf\/2108.01260","DOI":"10.1145\/1122445.1122456"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Chen, H., et al.: An overview of domain-specific foundation model: key technologies, applications and challenges (2024). arXiv:2409.04267","DOI":"10.1007\/s11432-025-4498-2"},{"key":"19_CR10","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"issue":"1","key":"19_CR11","doi-asserted-by":"publisher","first-page":"49","DOI":"10.1109\/TAFFC.2021.3053275","volume":"14","author":"J Deng","year":"2021","unstructured":"Deng, J., Ren, F.: A survey of textual emotion recognition and its challenges. IEEE Trans. Affect. Comput. 14(1), 49\u201367 (2021)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"19_CR12","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding (2018). arXiv:1810.04805"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Felbo, B., Mislove, A., S\u00f8gaard, A., Rahwan, I., Lehmann, S.: Using millions of emoji occurrences to learn any-domain representations for detecting sentiment, emotion and sarcasm (2017). arXiv:1708.00524","DOI":"10.18653\/v1\/D17-1169"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Guzhov, A., Raue, F., Hees, J., Dengel, A.: AudioCLIP: extending clip to image, text and audio. In: ICASSP 2022\u20142022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 976\u2013980. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"19_CR16","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Le\u00a0Ngo, A.C., Phan, R.C.W., See, J.: Spontaneous subtle expression recognition: imbalanced databases and solutions. In: Computer Vision\u2014ACCV 2014: 12th Asian Conference on Computer Vision, Singapore, Singapore, November 1\u20135, 2014, Revised Selected Papers, Part IV 12, pp. 33\u201348. Springer (2015)","DOI":"10.1007\/978-3-319-16817-3_3"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: ClipBERT for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"issue":"5","key":"19_CR19","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone, S.R., Russo, F.A.: The Ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in north American English. PLoS ONE 13(5), e0196391 (2018)","journal-title":"PLoS ONE"},{"key":"19_CR20","unstructured":"Ma, S., Zeng, Z., McDuff, D., Song, Y.: Active contrastive learning of audio-visual video representations (2020). arXiv:2009.09805"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Pan, B., Hirota, K., Jia, Z., Dai, Y.: A review of multimodal emotion recognition from datasets, preprocessing, features, and fusion methods. Neurocomputing 126866 (2023)","DOI":"10.1016\/j.neucom.2023.126866"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Pan, T., Song, Y., Yang, T., Jiang, W., Liu, W.: VideoMoCo: contrastive video representation learning with temporally adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11205\u201311214 (2021)","DOI":"10.1109\/CVPR46437.2021.01105"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., Mihalcea, R.: MELD: a multimodal multi-party dataset for emotion recognition in conversations (2018). arXiv:1810.02508","DOI":"10.18653\/v1\/P19-1050"},{"key":"19_CR25","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"19_CR26","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. 
In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"19_CR27","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Ranganathan, H., Chakraborty, S., Panchanathan, S.: Multimodal emotion recognition using deep learning architectures. In: 2016 IEEE Winter Conference on Applications of Computer Vision (WACV), pp.\u00a01\u20139. IEEE (2016)","DOI":"10.1109\/WACV.2016.7477679"},{"issue":"1","key":"19_CR29","first-page":"53","volume":"2","author":"A Saxena","year":"2020","unstructured":"Saxena, A., Khanna, A., Gupta, D.: Emotion recognition and detection methods: a comprehensive survey. J. Artif. Intell. Syst. 2(1), 53\u201379 (2020)","journal-title":"J. Artif. Intell. Syst."},{"key":"19_CR30","unstructured":"Schuller, B., et al.: Affective computing has changed: the foundation model disruption (2024). arXiv:2409.08907"},{"key":"19_CR31","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7464\u20137473 (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: A systematic review on affective computing: emotion models, databases, and recent advances. Inf. Fusion 83, 19\u201352 (2022)","DOI":"10.1016\/j.inffus.2022.03.009"},{"key":"19_CR33","unstructured":"Yang, C., Xu, Y., Dai, B., Zhou, B.: Video representation learning with visual tempo consistency (2020). arXiv:2006.15489"},{"key":"19_CR34","unstructured":"Yang, C., et al.: Survey on knowledge distillation for large language models: methods, evaluation, and application. ACM Trans. Intell. Syst. and Technol. (2024)"},{"key":"19_CR35","unstructured":"Yuan, Y.: On the power of foundation models. In: International Conference on Machine Learning, pp. 40519\u201340530. PMLR (2023)"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Chen, M., Poria, S., Cambria, E., Morency, L.P.: Tensor fusion network for multimodal sentiment analysis (2017). arXiv:1707.07250","DOI":"10.18653\/v1\/D17-1115"},{"key":"19_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, S., Yang, Y., Chen, C., Zhang, X., Leng, Q., Zhao, X.: Deep learning-based multimodal emotion recognition from audio, visual, and text modalities: a systematic review of recent advancements and future prospects. Expert Syst. Appl. 121692 (2023)","DOI":"10.1016\/j.eswa.2023.121692"},{"key":"19_CR38","doi-asserted-by":"crossref","unstructured":"Zhao, X., Poria, S., Li, X., Chen, Y., Tang, B.: Toward robust multimodal sentiment analysis using multimodal foundational models. Expert Syst. Appl. 126974 (2025)","DOI":"10.1016\/j.eswa.2025.126974"},{"key":"19_CR39","doi-asserted-by":"crossref","unstructured":"Zhuang, C., She, T., Andonian, A., Mark, M.S., Yamins, D.: Unsupervised learning from video with deep neural embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
9563\u20139572 (2020)","DOI":"10.1109\/CVPR42600.2020.00958"}],"container-title":["Lecture Notes in Computer Science","Image Analysis and Processing \u2013 ICIAP 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-10192-1_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T01:29:02Z","timestamp":1767317342000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-10192-1_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032101914","9783032101921"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-10192-1_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"2 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIAP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Image Analysis and Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Rome","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iciap2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.iciap.org\/home","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
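
The record above is a Crossref REST API work object: a `status`/`message-type` envelope around a `message` payload carrying the chapter's bibliographic fields (`title`, `author`, `DOI`, `page`, `container-title`, the deposited `reference` list, and so on). A minimal sketch of how such a record can be fetched and its common fields read, assuming only the public `https://api.crossref.org/works/{DOI}` endpoint and the field names visible in the record itself; the `mailto` contact in the User-Agent is a placeholder, following Crossref's "polite pool" convention of clients identifying themselves:

```python
# Minimal sketch: fetch and read a Crossref work record like the one above.
# Assumptions: public Crossref REST API; the mailto address is a placeholder.
import requests

DOI = "10.1007/978-3-032-10192-1_19"

# Crossref asks polite clients to include a contact in the User-Agent.
headers = {"User-Agent": "example-client/0.1 (mailto:you@example.org)"}

resp = requests.get(f"https://api.crossref.org/works/{DOI}",
                    headers=headers, timeout=30)
resp.raise_for_status()
msg = resp.json()["message"]  # the work object shown in the record above

# Pull out the fields most often needed for a bibliography entry.
title = msg["title"][0]
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in msg["author"])
year = msg["published"]["date-parts"][0][0]
pages = msg.get("page", "")
container = msg.get("container-title", [""])[-1]

print(f"{authors}: {title}. In: {container}, pp. {pages} ({year})")
print(f"{msg['references-count']} references deposited")
```

For this record the script would print the chapter entry (ICIAP 2025, pp. 224-235, 2026) followed by "39 references deposited"; the same `message` dictionary also exposes the `reference` array serialized above for citation-graph use.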