{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T07:31:51Z","timestamp":1781163111541,"version":"3.54.1"},"reference-count":53,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100018554","name":"Science and Technology Program of Gansu Province","doi-asserted-by":"publisher","award":["26JRRA258"],"award-info":[{"award-number":["26JRRA258"]}],"id":[{"id":"10.13039\/501100018554","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["lzujbky-2023-16"],"award-info":[{"award-number":["lzujbky-2023-16"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013804","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013804","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100012899","name":"Lanzhou University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100012899","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472203"],"award-info":[{"award-number":["62472203"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.patcog.2026.113586","type":"journal-article","created":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T00:09:20Z","timestamp":1774310960000},"page":"113586","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["EmoRAct: A neuro-symbolic framework coupling acoustic tokens with prosody semantics for emotion recognition"],"prefix":"10.1016","volume":"179","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7571-6439","authenticated-orcid":false,"given":"Minqiang","family":"Yang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8806-8723","authenticated-orcid":false,"given":"Hui","family":"Bai","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7402-4687","authenticated-orcid":false,"given":"Mingwen","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9712-9380","authenticated-orcid":false,"given":"Yongfeng","family":"Tao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2258-8530","authenticated-orcid":false,"given":"Changsheng","family":"Ma","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3514-5413","authenticated-orcid":false,"given":"Bin","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113586_bib0001","series-title":"Proceedings of the 24th ACM International Conference on Multimedia","first-page":"1365","article-title":"Video eCommerce: towards online video advertising","author":"Cheng","year":"2016"},{"issue":"6","key":"10.1016\/j.patcog.2026.113586_bib0002","doi-asserted-by":"crossref","first-page":"1170","DOI":"10.1109\/TMM.2016.2647386","article-title":"Video eCommerce++: toward large scale online video advertising","volume":"19","author":"Cheng","year":"2017","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113586_bib0003","doi-asserted-by":"crossref","DOI":"10.1016\/j.jnca.2019.102423","article-title":"A survey of emotion recognition methods with emphasis on E-Learning environments","volume":"147","author":"Imani","year":"2019","journal-title":"J. Netw. Comput. Appl."},{"issue":"1","key":"10.1016\/j.patcog.2026.113586_bib0004","doi-asserted-by":"crossref","DOI":"10.18401\/2017.7.1.3","article-title":"Emotion recognition, emotion expression, and cultural display rules: implications for counseling","volume":"7","author":"Hutchison","year":"2017","journal-title":"J. Asia Pac. Couns."},{"issue":"20","key":"10.1016\/j.patcog.2026.113586_bib0005","doi-asserted-by":"crossref","first-page":"41297","DOI":"10.1109\/JIOT.2025.3575085","article-title":"Echoes of empathy: a symbiotic IoT-based emotion feedback framework for psychological interventions via large language model","volume":"12","author":"Yang","year":"2025","journal-title":"IEEE Internet Things J."},{"key":"10.1016\/j.patcog.2026.113586_bib0006","first-page":"1","article-title":"SguiNet: stimulus-guided depression detection network based on ocular images","author":"Yang","year":"2026","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"12","key":"10.1016\/j.patcog.2026.113586_bib0007","doi-asserted-by":"crossref","first-page":"1773","DOI":"10.1109\/JPROC.2025.3542324","article-title":"Digital phenotyping and feature extraction on smartphone data for depression detection","volume":"112","author":"Yang","year":"2024","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.patcog.2026.113586_bib0008","article-title":"InstructERC: reforming emotion recognition in conversation with a retrieval multi-task LLMs framework","author":"Lei","year":"2023","journal-title":"CoRR"},{"key":"10.1016\/j.patcog.2026.113586_bib0009","doi-asserted-by":"crossref","first-page":"3451","DOI":"10.1109\/TASLP.2021.3122291","article-title":"HuBERT: self-supervised speech representation learning by masked prediction of hidden units","volume":"29","author":"Hsu","year":"2021","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.patcog.2026.113586_bib0010","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia","first-page":"9340","article-title":"Two in one go: single-stage emotion recognition with decoupled subject-context transformer","author":"Li","year":"2024"},{"issue":"3","key":"10.1016\/j.patcog.2026.113586_bib0011","doi-asserted-by":"crossref","first-page":"1802","DOI":"10.1109\/TAFFC.2025.3539225","article-title":"SDRS: sentiment-aware disentangled representation shifting for multimodal sentiment analysis","volume":"16","author":"Zhao","year":"2025","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"10","key":"10.1016\/j.patcog.2026.113586_bib0012","doi-asserted-by":"crossref","first-page":"6729","DOI":"10.1109\/TPAMI.2021.3094362","article-title":"Affective image content analysis: two decades review and new perspectives","volume":"44","author":"Zhao","year":"2021","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113586_bib0013","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110261","article-title":"EmoComicNet: a multi-task model for comic emotion recognition","volume":"150","author":"Dutta","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113586_bib0014","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111720","article-title":"Context transformer with multiscale fusion for robust facial emotion recognition","author":"Gan","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113586_bib0015","series-title":"Proceedings of the 2nd on Multimodal Sentiment Analysis Challenge","first-page":"15","article-title":"Multimodal emotion recognition and sentiment analysis via attention enhanced recurrent model","author":"Sun","year":"2021"},{"key":"10.1016\/j.patcog.2026.113586_bib0016","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"6818","article-title":"DialogueRNN: an attentive RNN for emotion detection in conversations","volume":"33","author":"Majumder","year":"2019"},{"key":"10.1016\/j.patcog.2026.113586_bib0017","series-title":"Proceedings of the Conference. Association for Computational Linguistics. Meeting","first-page":"6558","article-title":"Multimodal transformer for unaligned multimodal language sequences","volume":"2019","author":"Tsai","year":"2019"},{"key":"10.1016\/j.patcog.2026.113586_bib0018","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.111346","article-title":"TMBL: transformer-based multimodal binding learning model for multimodal sentiment analysis","volume":"285","author":"Huang","year":"2024","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.patcog.2026.113586_bib0019","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111340","article-title":"FrameERC: framelet transform based multimodal graph neural networks for emotion recognition in conversation","volume":"161","author":"Li","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113586_bib0020","doi-asserted-by":"crossref","DOI":"10.1016\/j.ins.2024.121393","article-title":"Multimodal graph learning with framelet-based stochastic configuration networks for emotion recognition in conversation","volume":"686","author":"Shi","year":"2025","journal-title":"Inf. Sci."},{"key":"10.1016\/j.patcog.2026.113586_bib0021","unstructured":"Y. Lei, D. Yang, Z. Chen, J. Chen, P. Zhai, L. Zhang, Large vision-language models as emotion recognizers in context awareness, (2024).arXiv preprint arXiv: 2407.11300."},{"key":"10.1016\/j.patcog.2026.113586_bib0022","doi-asserted-by":"crossref","first-page":"49250","DOI":"10.52202\/075280-2142","article-title":"InstructBLIP: towards general-purpose vision-language models with instruction tuning","volume":"36","author":"Dai","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"9","key":"10.1016\/j.patcog.2026.113586_bib0023","doi-asserted-by":"crossref","first-page":"6456","DOI":"10.1007\/s11263-025-02495-3","article-title":"StimuVAR: spatiotemporal stimuli-aware video affective reasoning with multimodal large language models","volume":"133","author":"Guo","year":"2025","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.patcog.2026.113586_bib0024","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110794","article-title":"A survey of dialogic emotion analysis: developments, approaches and perspectives","volume":"156","author":"Gan","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113586_bib0025","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.112122","article-title":"A systematic review of interpretability and explainability for speech emotion features in automatic speech emotion recognition","volume":"171","author":"Jayasinghe","year":"2026","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113586_bib0026","doi-asserted-by":"crossref","first-page":"110805","DOI":"10.52202\/079017-3518","article-title":"Emotion-LLaMA: multimodal emotion recognition and reasoning with instruction tuning","volume":"37","author":"Cheng","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113586_bib0027","unstructured":"Z. Lian, H. Chen, L. Chen, H. Sun, L. Sun, Y. Ren, Z. Cheng, B. Liu, R. Liu, X. Peng, et al., AffectGPT: a new dataset, model, and benchmark for emotion understanding with multimodal large language models, (2025).arXiv preprint arXiv: 2501.16566."},{"key":"10.1016\/j.patcog.2026.113586_bib0028","unstructured":"S. Bai, K. Chen, X. Liu, J. Wang, W. Ge, S. Song, K. Dang, P. Wang, S. Wang, J. Tang, et al., Qwen2.5-VL technical report, (2025).arXiv preprint arXiv: 2502.13923."},{"key":"10.1016\/j.patcog.2026.113586_bib0029","doi-asserted-by":"crossref","first-page":"123","DOI":"10.1016\/j.inffus.2022.10.009","article-title":"Dynamic interactive multiview memory network for emotion recognition in conversation","volume":"91","author":"Wen","year":"2023","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.patcog.2026.113586_bib0030","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2023.110285","article-title":"Hierarchically stacked graph convolution for emotion recognition in conversation","volume":"263","author":"Wang","year":"2023","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.patcog.2026.113586_bib0031","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111783","article-title":"Exp-VQA: fine-grained facial expression analysis via visual question answering","volume":"168","author":"Yuan","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113586_bib0032","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"26296","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113586_bib0033","unstructured":"Y. Su, T. Lan, H. Li, J. Xu, Y. Wang, D. Cai, PandaGPT: one model to instruction-follow them all, (2023).arXiv preprint arXiv: 2305.16355."},{"key":"10.1016\/j.patcog.2026.113586_bib0034","series-title":"The Thirteenth International Conference on Learning Representations","article-title":"Towards semantic equivalence of tokenization in multimodal LLM","author":"Wu","year":"2025"},{"key":"10.1016\/j.patcog.2026.113586_bib0035","series-title":"European Conference on Computer Vision","first-page":"105","article-title":"Prompting visual-language models for efficient video understanding","author":"Ju","year":"2022"},{"key":"10.1016\/j.patcog.2026.113586_bib0036","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Adv. Neural Inf. Process Syst."},{"issue":"6","key":"10.1016\/j.patcog.2026.113586_bib0037","doi-asserted-by":"crossref","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","article-title":"WavLM: large-scale self-supervised pre-training for full stack speech processing","volume":"16","author":"Chen","year":"2022","journal-title":"IEEE J. Sel. Top Signal Process."},{"key":"10.1016\/j.patcog.2026.113586_bib0038","doi-asserted-by":"crossref","first-page":"495","DOI":"10.1109\/TASLP.2021.3129994","article-title":"SoundStream: an end-to-end neural audio codec","volume":"30","author":"Zeghidour","year":"2021","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.patcog.2026.113586_bib0039","article-title":"Neural discrete representation learning","volume":"30","author":"Van Den Oord","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113586_bib0040","doi-asserted-by":"crossref","first-page":"2523","DOI":"10.1109\/TASLP.2023.3288409","article-title":"AudioLM: a language modeling approach to audio generation","volume":"31","author":"Borsos","year":"2023","journal-title":"IEEE\/ACM Trans. Audio Speech. Lang. Process."},{"key":"10.1016\/j.patcog.2026.113586_bib0041","doi-asserted-by":"crossref","unstructured":"A. Ghosh, A. Acharya, S. Saha, V. Jain, A. Chadha, Exploring the frontier of vision-language models: a survey of current methodologies and future directions, (2024).arXiv preprint arXiv: 2404.07214.","DOI":"10.2139\/ssrn.4783140"},{"issue":"2","key":"10.1016\/j.patcog.2026.113586_bib0042","doi-asserted-by":"crossref","first-page":"142","DOI":"10.1109\/T-AFFC.2012.38","article-title":"Detecting depression severity from vocal prosody","volume":"4","author":"Yang","year":"2012","journal-title":"IEEE Trans. Affect. Comput"},{"key":"10.1016\/j.patcog.2026.113586_bib0043","series-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","first-page":"527","article-title":"MELD : a multimodal multi-party dataset for emotion recognition in conversations","author":"Poria","year":"2019"},{"key":"10.1016\/j.patcog.2026.113586_bib0044","series-title":"Proceedings of the 33rd ACM International Conference on Multimedia","first-page":"13837","article-title":"MER2025: when affective computing meets large language models","author":"Lian","year":"2025"},{"key":"10.1016\/j.patcog.2026.113586_bib0045","unstructured":"Y. Kim, Convolutional neural networks for sentence classification, (2014).arXiv preprint arXiv: 1408.5882."},{"key":"10.1016\/j.patcog.2026.113586_bib0046","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Memory fusion network for multi-view sequential learning","volume":"Vol. 32","author":"Zadeh","year":"2018"},{"key":"10.1016\/j.patcog.2026.113586_bib0047","series-title":"Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (volume 1: Long papers)","first-page":"873","article-title":"Context-dependent sentiment analysis in user-generated videos","author":"Poria","year":"2017"},{"key":"10.1016\/j.patcog.2026.113586_bib0048","series-title":"Proceedings of the Conference. Association for Computational Linguistics. North American Chapter. Meeting","first-page":"2122","article-title":"Conversational memory network for emotion recognition in dyadic dialogue videos","volume":"2018","author":"Hazarika","year":"2018"},{"key":"10.1016\/j.patcog.2026.113586_bib0049","series-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","first-page":"2594","article-title":"ICON: interactive conversational memory network for multimodal emotion detection","author":"Hazarika","year":"2018"},{"issue":"3","key":"10.1016\/j.patcog.2026.113586_bib0050","doi-asserted-by":"crossref","first-page":"1426","DOI":"10.1109\/TAFFC.2020.3005660","article-title":"Adapted dynamic memory network for emotion recognition in conversation","volume":"13","author":"Xing","year":"2020","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.patcog.2026.113586_bib0051","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.127550","article-title":"Multimodal knowledge-enhanced interactive network with mixed contrastive learning for emotion recognition in conversation","volume":"582","author":"Shen","year":"2024","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patcog.2026.113586_bib0052","doi-asserted-by":"crossref","first-page":"77","DOI":"10.1109\/TMM.2023.3260635","article-title":"GraphCFC: a directed graph based cross-modal feature complementation approach for multimodal conversational emotion recognition","volume":"26","author":"Li","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113586_bib0053","doi-asserted-by":"crossref","first-page":"4422","DOI":"10.1109\/TMM.2021.3117062","article-title":"LR-GCN: latent relation-aware graph convolutional network for conversational emotion recognition","volume":"24","author":"Ren","year":"2021","journal-title":"IEEE Trans. Multimed."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005522?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326005522?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T13:13:53Z","timestamp":1780492433000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326005522"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":53,"alternative-id":["S0031320326005522"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113586","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"EmoRAct: A neuro-symbolic framework coupling acoustic tokens with prosody semantics for emotion recognition","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113586","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113586"}}