{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:58:04Z","timestamp":1780934284514,"version":"3.54.1"},"reference-count":37,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100004826","name":"Beijing Natural Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100009546","name":"NCUT","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100009546","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.114018","type":"journal-article","created":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T16:14:42Z","timestamp":1779380082000},"page":"114018","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Multimodal emotion recognition via large model guided dialogue state tracking with dynamic graph refinement"],"prefix":"10.1016","volume":"180","author":[{"given":"Qing","family":"Zhang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yu","family":"Sui","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haoze","family":"Guo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5953-4566","authenticated-orcid":false,"given":"Jie","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianyong","family":"Duan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hao","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Li","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Linqi","family":"Song","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guizhong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.114018_b1","doi-asserted-by":"crossref","unstructured":"S. Poria, D. Hazarika, N. Majumder, G. Naik, E. Cambria, R. Mihalcea, Meld: A multimodal multi-party dataset for emotion recognition in conversations, in: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, 2019, pp. 527\u2013536.","DOI":"10.18653\/v1\/P19-1050"},{"issue":"23","key":"10.1016\/j.patcog.2026.114018_b2","doi-asserted-by":"crossref","first-page":"27327","DOI":"10.1007\/s11042-024-20227-6","article-title":"Multimodal emotion recognition based on a fusion of audiovisual information with temporal dynamics","volume":"84","author":"Salas-C\u00e1ceres","year":"2025","journal-title":"Multimedia Tools Appl."},{"key":"10.1016\/j.patcog.2026.114018_b3","series-title":"ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Exploring complementary features in multi-modal speech emotion recognition","author":"Wang","year":"2023"},{"key":"10.1016\/j.patcog.2026.114018_b4","doi-asserted-by":"crossref","unstructured":"D. Ghosal, N. Majumder, S. Poria, N. Chhaya, A. Gelbukh, Dialoguegcn: A graph convolutional neural network for emotion recognition in conversation, in: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP, 2019, pp. 154\u2013164.","DOI":"10.18653\/v1\/D19-1015"},{"key":"10.1016\/j.patcog.2026.114018_b5","doi-asserted-by":"crossref","unstructured":"W. Shen, S. Wu, Y. Yang, X. Quan, Directed acyclic graph network for conversational emotion recognition, in: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), 2021, pp. 1551\u20131560.","DOI":"10.18653\/v1\/2021.acl-long.123"},{"key":"10.1016\/j.patcog.2026.114018_b6","doi-asserted-by":"crossref","unstructured":"J. Hu, Y. Liu, J. Zhao, Q. Jin, MMGCN: Multimodal fusion via deep graph convolution network for emotion recognition in conversation, in: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), 2021, pp. 5666\u20135675.","DOI":"10.18653\/v1\/2021.acl-long.440"},{"issue":"22","key":"10.1016\/j.patcog.2026.114018_b7","doi-asserted-by":"crossref","first-page":"4714","DOI":"10.3390\/electronics12224714","article-title":"Emotion recognition in conversations: A survey focusing on context, speaker dependencies, and fusion methods","volume":"12","author":"Fu","year":"2023","journal-title":"Electronics"},{"key":"10.1016\/j.patcog.2026.114018_b8","doi-asserted-by":"crossref","unstructured":"C.-B. Nguyen, D.-T. Le, Q.T. Ha, et al., Curriculum learning meets directed acyclic graph for multimodal emotion recognition, in: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, LREC-COLING 2024, 2024, pp. 4259\u20134265.","DOI":"10.63317\/3eikm2yttbsc"},{"key":"10.1016\/j.patcog.2026.114018_b9","doi-asserted-by":"crossref","unstructured":"N. Majumder, S. Poria, D. Hazarika, R. Mihalcea, A. Gelbukh, E. Cambria, Dialoguernn: An attentive rnn for emotion detection in conversations, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 33, 2019, pp. 6818\u20136825.","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"10.1016\/j.patcog.2026.114018_b10","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2021","first-page":"2694","article-title":"DialogueTRM: Exploring multi-modal emotional dynamics in a conversation","author":"Mao","year":"2021"},{"key":"10.1016\/j.patcog.2026.114018_b11","doi-asserted-by":"crossref","unstructured":"W. Zheng, J. Yu, R. Xia, S. Wang, A facial expression-aware multimodal multi-task learning framework for emotion recognition in multi-party conversations, in: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 2023, pp. 15445\u201315459.","DOI":"10.18653\/v1\/2023.acl-long.861"},{"key":"10.1016\/j.patcog.2026.114018_b12","doi-asserted-by":"crossref","unstructured":"G. Tu, F. Xiong, B. Liang, H. Wang, X. Zeng, R. Xu, Multimodal emotion recognition calibration in conversations, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 9621\u20139630.","DOI":"10.1145\/3664647.3681515"},{"key":"10.1016\/j.patcog.2026.114018_b13","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111340","article-title":"FrameERC: Framelet transform based multimodal graph neural networks for emotion recognition in conversation","volume":"161","author":"Li","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114018_b14","doi-asserted-by":"crossref","unstructured":"G. Hu, D. Kollias, X. Yang, Grounding Emotion Recognition with Visual Prototypes: VEGA-Revisiting CLIP in MERC, in: Proceedings of the 33rd ACM International Conference on Multimedia, 2025, pp. 5667\u20135676.","DOI":"10.1145\/3746027.3755340"},{"key":"10.1016\/j.patcog.2026.114018_b15","series-title":"2025 IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"A multi-stage and multi-target knowledge distillation framework for multimodal conversational emotion recognition","author":"Niu","year":"2025"},{"key":"10.1016\/j.patcog.2026.114018_b16","doi-asserted-by":"crossref","first-page":"110805","DOI":"10.52202\/079017-3518","article-title":"Emotion-llama: Multimodal emotion recognition and reasoning with instruction tuning","volume":"37","author":"Cheng","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114018_b17","doi-asserted-by":"crossref","unstructured":"V. Balaraman, S. Sheikhalishahi, B. Magnini, Recent neural methods on dialogue state tracking for task-oriented dialogue systems: A survey, in: Proceedings of the 22nd Annual Meeting of the Special Interest Group on Discourse and Dialogue, 2021, pp. 239\u2013251.","DOI":"10.18653\/v1\/2021.sigdial-1.25"},{"key":"10.1016\/j.patcog.2026.114018_b18","doi-asserted-by":"crossref","unstructured":"J. Cao, Y. Zhang, A comparative study on schema-guided dialogue state tracking, in: Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, 2021, pp. 782\u2013796.","DOI":"10.18653\/v1\/2021.naacl-main.62"},{"key":"10.1016\/j.patcog.2026.114018_b19","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111842","article-title":"Advanced dialogue state tracking with noetic graphs for complex human-machine interactions","volume":"168","author":"Li","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114018_b20","first-page":"30306","article-title":"Gslb: The graph structure learning benchmark","volume":"36","author":"Li","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114018_b21","series-title":"2025 IEEE International Conference on Multimedia and Expo","first-page":"1","article-title":"Multimodal emotion recognition in conversations via graph structure learning","author":"Xiong","year":"2025"},{"key":"10.1016\/j.patcog.2026.114018_b22","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"Enhanced multimodal emotion recognition in conversations via contextual filtering and multi-frequency graph propagation","author":"Zhao","year":"2025"},{"key":"10.1016\/j.patcog.2026.114018_b23","doi-asserted-by":"crossref","unstructured":"F. Wang, H. Ma, R. Xia, J. Yu, E. Cambria, Semeval-2024 task 3: Multimodal emotion cause analysis in conversations, in: Proceedings of the 18th International Workshop on Semantic Evaluation, SemEval-2024, 2024, pp. 2039\u20132050.","DOI":"10.18653\/v1\/2024.semeval-1.277"},{"key":"10.1016\/j.patcog.2026.114018_b24","doi-asserted-by":"crossref","unstructured":"J. Lee, Y. Wang, J. Li, M. Zhang, Multimodal reasoning with multimodal knowledge graph, in: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 2024, pp. 10767\u201310782.","DOI":"10.18653\/v1\/2024.acl-long.579"},{"key":"10.1016\/j.patcog.2026.114018_b25","doi-asserted-by":"crossref","unstructured":"Z. Han, B. Zhu, Y. Xu, P. Song, X. Yang, Benchmarking and bridging emotion conflicts for multimodal emotion reasoning, in: Proceedings of the 33rd ACM International Conference on Multimedia, 2025, pp. 5528\u20135537.","DOI":"10.1145\/3746027.3754856"},{"key":"10.1016\/j.patcog.2026.114018_b26","unstructured":"B. Xing, X. Liu, G. Zhao, C. Liu, X. Fu, H. K\u00e4lvi\u00e4inen, EmotionHallucer: Evaluating Emotion Hallucinations in Multimodal Large Language Models, in: International Conference on Learning Representations, ICLR, 2026."},{"key":"10.1016\/j.patcog.2026.114018_b27","series-title":"European Conference on Computer Vision","first-page":"348","article-title":"Multi-modal video dialogue state tracking in the wild","author":"Abdessaied","year":"2024"},{"key":"10.1016\/j.patcog.2026.114018_b28","doi-asserted-by":"crossref","unstructured":"Y.-H.H. Tsai, S. Bai, P.P. Liang, J.Z. Kolter, L.-P. Morency, R. Salakhutdinov, Multimodal transformer for unaligned multimodal language sequences, in: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, 2019, pp. 6558\u20136569.","DOI":"10.18653\/v1\/P19-1656"},{"key":"10.1016\/j.patcog.2026.114018_b29","doi-asserted-by":"crossref","unstructured":"R. Liu, H. Zuo, Z. Lian, H. Yuan, Q. Fan, Hardness-Aware Dynamic Curriculum Learning for Robust Multimodal Emotion Recognition with Missing Modalities, in: Proceedings of the 33rd ACM International Conference on Multimedia, 2025, pp. 5755\u20135764.","DOI":"10.1145\/3746027.3755605"},{"key":"10.1016\/j.patcog.2026.114018_b30","doi-asserted-by":"crossref","unstructured":"J. Zhao, T. Zhang, J. Hu, Y. Liu, Q. Jin, X. Wang, H. Li, M3ED: Multi-modal multi-scene multi-label emotional dialogue database, in: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 2022, pp. 5699\u20135710.","DOI":"10.18653\/v1\/2022.acl-long.391"},{"issue":"4","key":"10.1016\/j.patcog.2026.114018_b31","doi-asserted-by":"crossref","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","article-title":"IEMOCAP: Interactive emotional dyadic motion capture database","volume":"42","author":"Busso","year":"2008","journal-title":"Lang. Resour. Eval."},{"issue":"1","key":"10.1016\/j.patcog.2026.114018_b32","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/TAFFC.2017.2740923","article-title":"Affectnet: A database for facial expression, valence, and arousal computing in the wild","volume":"10","author":"Mollahosseini","year":"2017","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.patcog.2026.114018_b33","doi-asserted-by":"crossref","unstructured":"Z. Liu, Y. Lin, Y. Cao, H. Hu, Y. Wei, Z. Zhang, S. Lin, B. Guo, Swin transformer: Hierarchical vision transformer using shifted windows, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.patcog.2026.114018_b34","doi-asserted-by":"crossref","first-page":"776","DOI":"10.1109\/TMM.2023.3271019","article-title":"A transformer-based model with self-distillation for multimodal emotion recognition in conversations","volume":"26","author":"Ma","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.114018_b35","doi-asserted-by":"crossref","unstructured":"Z. Yi, Z. Zhao, Z. Shen, T. Zhang, Multimodal fusion via hypergraph autoencoder and contrastive learning for emotion recognition in conversation, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 4341\u20134348.","DOI":"10.1145\/3664647.3681633"},{"key":"10.1016\/j.patcog.2026.114018_b36","doi-asserted-by":"crossref","unstructured":"T. Zhang, Z. Tan, ECERC: evidence-cause attention network for multi-modal emotion recognition in conversation, in: Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 2025, pp. 2064\u20132077.","DOI":"10.18653\/v1\/2025.acl-long.102"},{"key":"10.1016\/j.patcog.2026.114018_b37","series-title":"Findings of the Association for Computational Linguistics: NAACL 2024","first-page":"4521","article-title":"Emotion-anchored contrastive learning framework for emotion recognition in conversation","author":"Yu","year":"2024"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009830?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009830?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:02:51Z","timestamp":1780930971000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326009830"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":37,"alternative-id":["S0031320326009830"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114018","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multimodal emotion recognition via large model guided dialogue state tracking with dynamic graph refinement","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114018","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114018"}}