{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T09:56:59Z","timestamp":1781776619164,"version":"3.54.5"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032127662","type":"print"},{"value":"9783032127679","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-12767-9_13","type":"book-chapter","created":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T02:26:25Z","timestamp":1767320785000},"page":"110-120","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Emotion Recognition and Contextual Analysis in Therapy Sessions Using Video Large Language Models"],"prefix":"10.1007","author":[{"given":"Rabia","family":"Jafri","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Sushant","family":"Patil","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pranav","family":"Krishnakumar","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Syed Omar","family":"Ali","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Syed Abid","family":"Ali","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Syed Fawad","family":"Hussain","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,1,2]]},"reference":[{"issue":"1","key":"13_CR1","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1016\/j.ypsc.2022.05.006","volume":"2","author":"R Khanna","year":"2022","unstructured":"Khanna, R., Robinson, N., O\u2019Donnell, M., Eyre, H., Smith, E.: Affective computing in psychotherapy. Adv. Psychiatry Behav. Health. 2(1), 95\u2013105 (2022)","journal-title":"Adv. Psychiatry Behav. Health"},{"issue":"6","key":"13_CR2","doi-asserted-by":"publisher","DOI":"10.18280\/ts.410612","volume":"41","author":"Y Liu","year":"2024","unstructured":"Liu, Y., Zhang, Y., Wang, Y.: Application of deep learning-based image processing in emotion recognition and psychological therapy. Traitement Signal. 41(6), 2923 (2024)","journal-title":"Traitement Signal."},{"key":"13_CR3","doi-asserted-by":"publisher","first-page":"706","DOI":"10.1016\/j.procs.2022.12.187","volume":"216","author":"A Chowanda","year":"2023","unstructured":"Chowanda, A., Iswanto, I.A., Andangsari, E.W.: Exploring deep learning algorithm to model emotions recognition from speech. Procedia Comput. Sci. 216, 706\u2013713 (2023)","journal-title":"Procedia Comput. Sci."},{"key":"13_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127015","volume":"568","author":"SM George","year":"2024","unstructured":"George, S.M., Ilyas, P.M.: A review on speech emotion recognition: a survey, recent advances, challenges, and the influence of noise. Neurocomputing. 568, 127015 (2024)","journal-title":"Neurocomputing"},{"key":"13_CR5","doi-asserted-by":"publisher","DOI":"10.3389\/fcomp.2024.1359471","volume":"6","author":"JA Ballesteros","year":"2024","unstructured":"Ballesteros, J.A., Ram\u00edrez, V.G.M., Moreira, F., Solano, A., Pelaez, C.A.: Facial emotion recognition through artificial intelligence. Front. Comput. Sci. 6, 1359471 (2024)","journal-title":"Front. Comput. Sci."},{"key":"13_CR6","doi-asserted-by":"publisher","first-page":"103976","DOI":"10.1109\/ACCESS.2024.3430850","volume":"12","author":"S Kalateh","year":"2024","unstructured":"Kalateh, S., Estrada-Jimenez, L.A., Nikghadam-Hojjati, S., Barata, J.: A systematic review on multimodal emotion recognition: building blocks, current state, applications, and challenges. IEEE Access. 12, 103976\u2013104019 (2024). https:\/\/doi.org\/10.1109\/ACCESS.2024.3430850","journal-title":"IEEE Access"},{"key":"13_CR7","unstructured":"Lian, Z., Sun, H., Sun, L., Gu, H., Wen, Z., Zhang, S., Chen, S., Xu, M., Xu, K., Chen, K.: Explainable multimodal emotion recognition. arXiv preprint arXiv:230615401. (2023)"},{"key":"13_CR8","unstructured":"Li, F., Zhang, R., Zhang, H., Zhang, Y., Li, B., Li, W., Ma, Z., Li, C.: Llava-next-interleave: tackling multi-image, video, and 3d in large multimodal models. arXiv preprint arXiv:240707895. (2024)"},{"key":"13_CR9","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1145\/3689092.3689403","volume-title":"Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing","author":"Y Xu","year":"2024","unstructured":"Xu, Y., Zhou, Y., Cai, Y., Xie, J., Ye, R., Wu, Z.: Multimodal emotion captioning using large language model with prompt engineering. In: Proceedings of the 2nd International Workshop on Multimodal and Responsible Affective Computing, pp. 104\u2013109 (2024)"},{"issue":"7","key":"13_CR10","doi-asserted-by":"publisher","DOI":"10.3390\/fi16070247","volume":"16","author":"L Vaiani","year":"2024","unstructured":"Vaiani, L., Cagliero, L., Garza, P.: Emotion recognition from videos using multimodal large language models. Future Internet. 16(7), 247 (2024)","journal-title":"Future Internet"},{"key":"13_CR11","unstructured":"Yang, Q., Bai, D., Peng, Y.-X., Wei, X.: Omni-emotion: extending video MLLM with detailed face and audio modeling for multimodal emotion analysis. arXiv preprint arXiv:250109502. (2025)"},{"key":"13_CR12","doi-asserted-by":"publisher","first-page":"110805","DOI":"10.52202\/079017-3518","volume":"37","author":"Z Cheng","year":"2024","unstructured":"Cheng, Z., Cheng, Z.-Q., He, J.-Y., Wang, K., Lin, Y., Lian, Z., Peng, X., Hauptmann, A.: Emotion-llama: multimodal emotion recognition and reasoning with instruction tuning. Adv. Neural Inf. Proces. Syst. 37, 110805\u2013110853 (2024)","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"13_CR13","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-chatgpt: towards detailed video understanding via large vision and language models. arXiv preprint arXiv:230605424. (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Lin, B., Ye, Y., Zhu, B., Cui, J., Ning, M., Jin, P., Yuan, L.: Video-llava: learning united visual representation by alignment before projection. arXiv preprint arXiv:231110122. (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"13_CR15","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1145\/3551876.3554806","volume-title":"Proceedings of the 3rd International on Multimodal Sentiment Analysis Workshop and Challenge, Lisboa, Portugal","author":"L Vaiani","year":"2022","unstructured":"Vaiani, L., Quatra, M.L., Cagliero, L., Garza, P.: ViPER: video-based perceiver for emotion recognition. In: Proceedings of the 3rd International on Multimodal Sentiment Analysis Workshop and Challenge, Lisboa, Portugal, pp. 67\u201373. Association for Computing Machinery (2022). https:\/\/doi.org\/10.1145\/3551876.3554806"},{"key":"13_CR16","unstructured":"Li, B., Zhang, Y., Guo, D., Zhang, R., Li, F., Zhang, H., Zhang, K., Zhang, P., Li, Y., Liu, Z.: Llava-onevision: easy visual task transfer. arXiv preprint arXiv:240803326. (2024)"},{"key":"13_CR17","doi-asserted-by":"publisher","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","volume":"29","author":"WN Hsu","year":"2021","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., Mohamed, A.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM Trans. Audio Speech Lang. Process. 29, 3451\u20133460 (2021). https:\/\/doi.org\/10.1109\/TASLP.2021.3122291","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"13_CR18","doi-asserted-by":"publisher","first-page":"6110","DOI":"10.1145\/3581783.3612365","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia, Ottawa ON, Canada","author":"L Sun","year":"2023","unstructured":"Sun, L., Lian, Z., Liu, B., Tao, J.: MAE-DFER: efficient masked autoencoder for self-supervised dynamic facial expression recognition. In: Proceedings of the 31st ACM International Conference on Multimedia, Ottawa ON, Canada, pp. 6110\u20136121. Association for Computing Machinery (2023). https:\/\/doi.org\/10.1145\/3581783.3612365"},{"key":"13_CR19","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural Inf. Proces. Syst. 35, 10078\u201310093 (2022)","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"13_CR20","first-page":"19358","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Fang","year":"2023","unstructured":"Fang, Y., Wang, W., Xie, B., Sun, Q., Wu, L., Wang, X., Huang, T., Wang, X., Eva, C.Y.: Exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)"},{"key":"13_CR21","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:230709288. (2023)"},{"issue":"4","key":"13_CR22","doi-asserted-by":"publisher","DOI":"10.3390\/info16040301","volume":"16","author":"A Rasool","year":"2025","unstructured":"Rasool, A., Aslam, S., Hussain, N., Imtiaz, S., Riaz, W.: nBERT: harnessing NLP for emotion recognition in psychotherapy to transform mental health care. Information. 16(4), 301 (2025)","journal-title":"Information"},{"issue":"1","key":"13_CR23","doi-asserted-by":"publisher","first-page":"190","DOI":"10.1016\/j.gltp.2022.03.008","volume":"3","author":"D Nixon","year":"2022","unstructured":"Nixon, D., Mallappa, V.V., Petli, V., HosgurMath, S., Kiran, K.S.: A novel AI therapy for depression counseling using face emotion techniques. Global Trans. Proc. 3(1), 190\u2013194 (2022). https:\/\/doi.org\/10.1016\/j.gltp.2022.03.008","journal-title":"Global Trans. Proc."},{"key":"13_CR24","first-page":"28492","volume-title":"Proceedings of the 40th International Conference on Machine Learning, Proceedings of Machine Learning Research","author":"A Radford","year":"2023","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., Mcleavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: Andreas, K., Emma, B., Kyunghyun, C., Barbara, E., Sivan, S., Jonathan, S. (eds.) Proceedings of the 40th International Conference on Machine Learning, Proceedings of Machine Learning Research, pp. 28492\u201328518. PMLR (2023)"},{"key":"13_CR25","doi-asserted-by":"publisher","first-page":"3222","DOI":"10.21437\/Interspeech.2023-205","volume-title":"Proceedings of Interspeech 2023","author":"A Plaquet","year":"2023","unstructured":"Plaquet, A., Bredin, H.: Powerset multi-class cross entropy loss for neural speaker diarization. In: Proceedings of Interspeech 2023, pp. 3222\u20133226 (2023). https:\/\/doi.org\/10.21437\/Interspeech.2023-205"},{"key":"13_CR26","unstructured":"Zhang, Y., Li, B., Liu, H., Lee, Y.J., Gui, L., Fu, D., Feng, J., Liu, Z., Li, C.: LLaVA-NeXT: A Strong Zero-Shot Video Understanding Model (2024). https:\/\/llava-vl.github.io\/blog\/2024-04-30-llava-next-video\/. Accessed June 11, 2025"},{"key":"13_CR27","unstructured":"Hugging Face \u2013 The AI community building the future. https:\/\/huggingface.co\/. Accessed June 16 2025"},{"key":"13_CR28","unstructured":"Li, M.: Michellejieli\/emotion_text_classifier (Hugging Face Model). Hugging Face (2023). https:\/\/huggingface.co\/michellejieli\/emotion_text_classifier. Accessed June 11 2025"},{"key":"13_CR29","unstructured":"Calabres, E.H.: wav2vec2-lg-xlsr-en-speech-emotion-recognition (Hugging Face Model). Hugging Face (2023). https:\/\/huggingface.co\/ehcalabres\/wav2vec2-lg-xlsr-en-speech-emotion-recognition. Accessed June 11 2025"},{"key":"13_CR30","unstructured":"Google Colaboratory. https:\/\/colab.research.google.com\/. Accessed June 16 2025"},{"key":"13_CR31","unstructured":"YouTube: YouTube (2025). https:\/\/www.youtube.com. Accessed June 14 2025"},{"key":"13_CR32","doi-asserted-by":"crossref","unstructured":"Min, S., Lyu, X., Holtzman, A., Artetxe, M., Lewis, M., Hajishirzi, H., Zettlemoyer, L.: Rethinking the role of demonstrations: What makes in-context learning work? arXiv preprint arXiv:220212837. (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.759"}],"container-title":["Communications in Computer and Information Science","HCI International 2025 \u2013 Late Breaking Posters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-12767-9_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T09:44:47Z","timestamp":1781775887000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-12767-9_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032127662","9783032127679"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-12767-9_13","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"2 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"HCII","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Human-Computer Interaction","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Gothenburg","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sweden","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 June 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 June 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"hcii2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2025.hci.international\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}