{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T15:24:41Z","timestamp":1773415481747,"version":"3.50.1"},"reference-count":45,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC0833200"],"award-info":[{"award-number":["2020YFC0833200"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1007\/s00530-024-01561-z","type":"journal-article","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T03:00:05Z","timestamp":1732158005000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["UCEMA: Uni-modal and cross-modal encoding network based on multi-head attention for emotion recognition in conversation"],"prefix":"10.1007","volume":"30","author":[{"given":"Hongkun","family":"Zhao","sequence":"first","affiliation":[]},{"given":"Siyuan","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Fanmin","family":"Kong","sequence":"additional","affiliation":[]},{"given":"Qingtian","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Kang","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,21]]},"reference":[{"key":"1561_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102306","volume":"106","author":"C Fan","year":"2024","unstructured":"Fan, C., Lin, J., Mao, R., Cambria, E.J.I.F.: Fusing pairwise modalities for emotion recognition in conversations. Inform. Fusion 106, 102306 (2024)","journal-title":"Inform. Fusion"},{"key":"1561_CR2","doi-asserted-by":"publisher","DOI":"10.1017\/S204579602400009X","volume":"33","author":"I Gorrino","year":"2024","unstructured":"Gorrino, I., et al.: A critical overview of emotion processing assessment in non-affective and affective psychoses. Epidemiol. Psychiatr. Sci. 33, e8 (2024)","journal-title":"Epidemiol. Psychiatr. Sci."},{"key":"1561_CR3","doi-asserted-by":"publisher","first-page":"3369","DOI":"10.1109\/TMM.2021.3097171","volume":"24","author":"J Wang","year":"2021","unstructured":"Wang, J., Bao, B.-K., Xu, C.J.I.T.O.M.: Dualvgr: a dual-visual graph reasoning unit for video question answering. IEEE Trans. Multimed. 24, 3369\u20133380 (2021)","journal-title":"IEEE Trans. Multimed."},{"issue":"1","key":"1561_CR4","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1109\/MMUL.2022.3217307","volume":"30","author":"P Li","year":"2022","unstructured":"Li, P., Tan, Z., Bao, B.-K.J.I.M.: Multiview language bias reduction for visual question answering. 
{"issue":"7","key":"1561_CR5","doi-asserted-by":"publisher","first-page":"4873","DOI":"10.1007\/s10462-021-10030-2","volume":"54","author":"K Cortis","year":"2021","unstructured":"Cortis, K., Davis, B.: Over a decade of social opinion mining: a systematic review. Artif. Intell. Rev. 54(7), 4873\u20134965 (2021)","journal-title":"Artif. Intell. Rev."},{"key":"1561_CR6","doi-asserted-by":"publisher","first-page":"113043","DOI":"10.1109\/ACCESS.2019.2935155","volume":"7","author":"X Shao","year":"2019","unstructured":"Shao, X., Tang, G., Bao, B.-K.: Personalized travel recommendation based on sentiment-aware multimodal topic model. IEEE Access 7, 113043\u2013113052 (2019)","journal-title":"IEEE Access"},{"key":"1561_CR7","doi-asserted-by":"publisher","first-page":"4471","DOI":"10.1109\/TMM.2021.3118881","volume":"24","author":"W Nie","year":"2021","unstructured":"Nie, W., Chang, R., Ren, M., Su, Y.: I-GCN: incremental graph convolution network for conversation emotion detection. IEEE Trans. Multimedia 24, 4471\u20134481 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"1561_CR8","doi-asserted-by":"crossref","unstructured":"Li, S., Yan, H., Qiu, X.: Contrast and generation make BART a good dialogue emotion recognizer. In: Proceedings of the AAAI Conference on Artificial Intelligence 36(10), 11002\u201311010 (2022)","DOI":"10.1609\/aaai.v36i10.21348"},{"key":"1561_CR9","doi-asserted-by":"publisher","first-page":"1803","DOI":"10.1109\/TASLP.2022.3171965","volume":"30","author":"W Fan","year":"2022","unstructured":"Fan, W., Xu, X., Cai, B., Xing, X.: ISNet: individual standardization network for speech emotion recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 1803\u20131814 (2022)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"4","key":"1561_CR10","doi-asserted-by":"publisher","first-page":"3164","DOI":"10.1109\/TAFFC.2022.3221749","volume":"14","author":"S Latif","year":"2022","unstructured":"Latif, S., Rana, R., Khalifa, S., Jurdak, R., Schuller, B.W.: Multitask learning from augmented auxiliary data for improving speech emotion recognition. IEEE Trans. Affect. Comput. 14(4), 3164\u20133176 (2022)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1561_CR11","doi-asserted-by":"crossref","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., Mihalcea, R.: MELD: a multimodal multi-party dataset for emotion recognition in conversations. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 527\u2013536, (2019)","DOI":"10.18653\/v1\/P19-1050"},{"key":"1561_CR12","doi-asserted-by":"crossref","unstructured":"Zhao, W., Zhao, Y., Lu, X.: CauAIN: causal aware interaction network for emotion recognition in conversations. In: Proceedings of the 31st International Joint Conference on Artificial Intelligence, pp. 4524\u20134530, (2022)","DOI":"10.24963\/ijcai.2022\/628"},{"key":"1561_CR13","doi-asserted-by":"crossref","unstructured":"Jiao, W., Yang, H., King, I., Lyu, M.R.: HiGRU: hierarchical gated recurrent units for utterance-level emotion recognition. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Minneapolis, pp. 397\u2013406, (2019)","DOI":"10.18653\/v1\/N19-1037"},
{"issue":"3","key":"1561_CR14","doi-asserted-by":"publisher","first-page":"2276","DOI":"10.1109\/TAFFC.2022.3172360","volume":"14","author":"S Mai","year":"2022","unstructured":"Mai, S., Zeng, Y., Zheng, S., Hu, C.: Hybrid contrastive learning of tri-modal representation for multimodal sentiment analysis. IEEE Trans. Affect. Comput. 14(3), 2276\u20132289 (2022)","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"5","key":"1561_CR15","doi-asserted-by":"publisher","first-page":"2455","DOI":"10.3390\/s23052455","volume":"23","author":"Y Cai","year":"2023","unstructured":"Cai, Y., Li, X., Li, J.: Emotion recognition using different sensors, emotion models, methods and datasets: a comprehensive review. Sensors 23(5), 2455 (2023)","journal-title":"Sensors"},{"key":"1561_CR16","doi-asserted-by":"crossref","unstructured":"Hazarika, D., Poria, S., Mihalcea, R., Cambria, E., Zimmermann, R.: ICON: interactive conversational memory network for multimodal emotion detection. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 2594\u20132604, (2018)","DOI":"10.18653\/v1\/D18-1280"},{"key":"1561_CR17","doi-asserted-by":"crossref","unstructured":"Majumder, N., Poria, S., Hazarika, D., Mihalcea, R., Gelbukh, A., Cambria, E.: DialogueRNN: an attentive RNN for emotion detection in conversations. In: Proceedings of the AAAI Conference on Artificial Intelligence 33(01), 6818\u20136825 (2019)","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"1561_CR18","doi-asserted-by":"crossref","unstructured":"Ghosal, D., Majumder, N., Poria, S., Chhaya, N., Gelbukh, A.: DialogueGCN: a graph convolutional neural network for emotion recognition in conversation. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp. 154\u2013164, (2019)","DOI":"10.18653\/v1\/D19-1015"},{"key":"1561_CR19","doi-asserted-by":"crossref","unstructured":"Hu, J., Liu, Y., Zhao, J., Jin, Q.: MMGCN: multimodal fusion via deep graph convolution network for emotion recognition in conversation. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics and the International Joint Conference on Natural Language Processing. Association for Computational Linguistics, pp. 5666\u20135675, (2021)","DOI":"10.18653\/v1\/2021.acl-long.440"},{"key":"1561_CR20","doi-asserted-by":"crossref","unstructured":"Hu, D., Hou, X., Wei, L., Jiang, L., Mo, Y.: MM-DFN: multimodal dynamic fusion network for emotion recognition in conversations. In: ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 7037\u20137041, (2022)","DOI":"10.1109\/ICASSP43922.2022.9747397"},{"key":"1561_CR21","doi-asserted-by":"publisher","first-page":"2325","DOI":"10.1109\/TASLP.2023.3284509","volume":"31","author":"L Yuan","year":"2023","unstructured":"Yuan, L., et al.: RBA-GCN: relational bilevel aggregation graph convolutional network for emotion recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 31, 2325\u20132337 (2023)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"1561_CR22","doi-asserted-by":"crossref","unstructured":"Yao, B., Shi, W.: Speaker-centric multimodal fusion networks for emotion recognition in conversations. In: ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing, pp. 8441\u20138445, (2024)","DOI":"10.1109\/ICASSP48485.2024.10447720"},
{"key":"1561_CR23","doi-asserted-by":"crossref","unstructured":"Shen, W., Chen, J., Quan, X., Xie, Z.: DialogXL: all-in-one XLNet for multi-party conversation emotion recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence 35(15), 13789\u201313797 (2021)","DOI":"10.1609\/aaai.v35i15.17625"},{"key":"1561_CR24","doi-asserted-by":"crossref","unstructured":"Ghosal, D., Majumder, N., Gelbukh, A., Mihalcea, R., Poria, S.: COSMIC: commonsense knowledge for emotion identification in conversations. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 2470\u20132481, (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.224"},{"issue":"3","key":"1561_CR25","doi-asserted-by":"publisher","first-page":"2209","DOI":"10.1109\/TAFFC.2022.3178231","volume":"14","author":"Y Sun","year":"2022","unstructured":"Sun, Y., Mai, S., Hu, H.: Learning to learn better unimodal representations via adaptive multimodal meta-learning. IEEE Trans. Affect. Comput. 14(3), 2209\u20132223 (2022)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"1561_CR26","doi-asserted-by":"crossref","unstructured":"Poria, S., Cambria, E., Hazarika, D., Majumder, N., Zadeh, A., Morency, L.-P.: Context-dependent sentiment analysis in user-generated videos. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics, pp. 873\u2013883, (2017)","DOI":"10.18653\/v1\/P17-1081"},{"key":"1561_CR27","doi-asserted-by":"crossref","unstructured":"Hu, D., Bao, Y., Wei, L., Zhou, W., Hu, S.: Supervised adversarial contrastive learning for emotion recognition in conversations. In: Rogers, A., Boyd-Graber, J.L., Okazaki, N. (eds.) Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, ACL, pp. 10835\u201310852, (2023)","DOI":"10.18653\/v1\/2023.acl-long.606"},{"issue":"5","key":"1561_CR28","doi-asserted-by":"publisher","first-page":"699","DOI":"10.1109\/TAI.2022.3149234","volume":"3","author":"G Tu","year":"2022","unstructured":"Tu, G., Wen, J., Liu, C., Jiang, D., Cambria, E.: Context- and sentiment-aware networks for emotion recognition in conversation. IEEE Trans. Artif. Intell. 3(5), 699\u2013708 (2022)","journal-title":"IEEE Trans. Artif. Intell."},{"key":"1561_CR29","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Liang, P.P., Mazumder, N., Poria, S., Cambria, E., Morency, L.-P.: Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32, no. 1, (2018)","DOI":"10.1609\/aaai.v32i1.12021"},{"issue":"1","key":"1561_CR30","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1109\/TNN.2008.2005605","volume":"20","author":"F Scarselli","year":"2008","unstructured":"Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE Trans. Neural Netw. 20(1), 61\u201380 (2008)","journal-title":"IEEE Trans. Neural Netw."},{"key":"1561_CR31","doi-asserted-by":"crossref","unstructured":"Li, J., Wang, X., Liu, Y., Zeng, Z.: CFN-ESA: a cross-modal fusion network with emotion-shift awareness for dialogue emotion recognition. IEEE Trans. Affect. Comput. (2024)","DOI":"10.1109\/TAFFC.2024.3389453"},{"key":"1561_CR32","doi-asserted-by":"crossref","unstructured":"Tao, M., Bao, B.-K., Tang, H., Xu, C.: GALIP: generative adversarial CLIPs for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14214\u201314223, (2023)","DOI":"10.1109\/CVPR52729.2023.01366"},
{"key":"1561_CR33","doi-asserted-by":"crossref","unstructured":"Luong, M.-T.: Effective approaches to attention-based neural machine translation. In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, (2015)","DOI":"10.18653\/v1\/D15-1166"},{"key":"1561_CR34","unstructured":"Vaswani, A.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30, (2017)"},{"key":"1561_CR35","unstructured":"Devlin, J.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Annual Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, (2019)"},{"key":"1561_CR36","unstructured":"Dosovitskiy, A.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations, (2021)"},{"key":"1561_CR37","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: European Conference on Computer Vision, pp. 213\u2013229, (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1561_CR38","doi-asserted-by":"crossref","unstructured":"Gulati, A., et al.: Conformer: convolution-augmented transformer for speech recognition. arXiv:2005.08100, (2020)","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"1561_CR39","doi-asserted-by":"crossref","unstructured":"Pan, B., Hirota, K., Jia, Z., Dai, Y.: A review of multimodal emotion recognition from datasets, preprocessing, features, and fusion methods. Neurocomputing, 126866 (2023)","DOI":"10.1016\/j.neucom.2023.126866"},{"key":"1561_CR40","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1016\/j.neunet.2022.09.022","volume":"156","author":"J Lei","year":"2022","unstructured":"Lei, J., Zhu, X., Wang, Y.: BAT: block and token self-attention for speech emotion recognition. Neural Netw. 156, 67\u201380 (2022)","journal-title":"Neural Netw."},{"key":"1561_CR41","doi-asserted-by":"publisher","first-page":"695","DOI":"10.1109\/TASLP.2022.3145287","volume":"30","author":"Y Zhou","year":"2022","unstructured":"Zhou, Y., Liang, X., Gu, Y., Yin, Y., Yao, L.: Multi-classifier interactive learning for ambiguous speech emotion recognition. IEEE\/ACM Trans. Audio Speech Lang. Process. 30, 695\u2013705 (2022)","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"1561_CR42","doi-asserted-by":"crossref","unstructured":"Graves, A.: Long short-term memory. In: Supervised Sequence Labelling with Recurrent Neural Networks, pp. 37\u201345, (2012)","DOI":"10.1007\/978-3-642-24797-2_4"},{"key":"1561_CR43","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso, C., et al.: IEMOCAP: interactive emotional dyadic motion capture database. Lang. Resour. Eval. 42, 335\u2013359 (2008)","journal-title":"Lang. Resour. Eval."},{"key":"1561_CR44","doi-asserted-by":"crossref","unstructured":"Eyben, F., W\u00f6llmer, M., Schuller, B.: openSMILE: the Munich versatile and fast open-source audio feature extractor. In: Proceedings of the 18th ACM International Conference on Multimedia, pp. 1459\u20131462, (2010)","DOI":"10.1145\/1873951.1874246"},
{"key":"1561_CR45","doi-asserted-by":"crossref","unstructured":"Li, Z., Tang, F., Zhao, M., Zhu, Y.: EmoCaps: emotion capsule based model for conversational emotion recognition. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 1610\u20131618, (2022)","DOI":"10.18653\/v1\/2022.findings-acl.126"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01561-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-024-01561-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01561-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,16]],"date-time":"2024-12-16T09:17:01Z","timestamp":1734340621000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-024-01561-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"references-count":45,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2024,12]]}},"alternative-id":["1561"],"URL":"https:\/\/doi.org\/10.1007\/s00530-024-01561-z","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"13 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 November 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 November 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"352"}}
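The record above is a standard Crossref REST API work envelope ({"status": "ok", "message-type": "work", "message": {...}}). For reference, a record of this shape can be retrieved directly by DOI from the public endpoint https://api.crossref.org/works/{doi}; the sketch below fetches this article's record and reads a few of the fields shown above. It is a minimal illustration, not part of the record itself: the endpoint and field names match Crossref's documented schema, while the contact email in the User-Agent header is a placeholder you should replace with your own.

```python
import json
import urllib.request

# DOI taken from the record above.
DOI = "10.1007/s00530-024-01561-z"

# Crossref returns the {"status": "ok", ..., "message": {...}} envelope seen above.
# Supplying a mailto: contact in the User-Agent routes requests to Crossref's
# "polite" pool; "editor@example.com" is a placeholder address.
req = urllib.request.Request(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "doi-fetch/0.1 (mailto:editor@example.com)"},
)
with urllib.request.urlopen(req) as resp:
    work = json.load(resp)["message"]

# A few of the fields present in the record above.
print(work["title"][0])
print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
print(work["container-title"][0], work.get("volume"), work.get("issue"))
print("references deposited:", work["reference-count"])
```

Run against this DOI, the script would print the article title, the six authors, "Multimedia Systems 30 6", and the 45-entry reference count, mirroring the fields in the JSON above.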