{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T23:15:20Z","timestamp":1780442120995,"version":"3.54.1"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T00:00:00Z","timestamp":1756944000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T00:00:00Z","timestamp":1756944000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62472059"],"award-info":[{"award-number":["62472059"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Chongqing Talent Plan Project","award":["CSTC2024YCJH-BGZXM0022"],"award-info":[{"award-number":["CSTC2024YCJH-BGZXM0022"]}]},{"name":"Science and Technology Innovation Key R&D Program of Chongqing","award":["CSTB2024TIAD-STX0027"],"award-info":[{"award-number":["CSTB2024TIAD-STX0027"]}]},{"name":"Open Research Fund of Key Laboratory of Cyberspace Big Data Intelligent Security (Chongqing University of Posts and Telecommunications), Ministry of Education","award":["CBDIS202403"],"award-info":[{"award-number":["CBDIS202403"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Intell Inf Syst"],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s10844-025-00975-z","type":"journal-article","created":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T09:10:21Z","timestamp":1756977021000},"page":"2057-2077","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Towards robust multimodal emotion recognition in conversation with multi-modal transformer and variational distillation fusion"],"prefix":"10.1007","volume":"63","author":[{"given":"Xiaofei","family":"Zhu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuming","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,9,4]]},"reference":[{"issue":"11","key":"975_CR1","doi-asserted-by":"publisher","first-page":"11418","DOI":"10.1609\/aaai.v39i11.33242","volume":"39","author":"W Ai","year":"2025","unstructured":"Ai, W., Zhang, F., Shou, Y., et al. (2025). Revisiting multimodal emotion recognition in conversation from the perspective of graph spectrum. Proceedings of the AAAI Conference on Artificial Intelligence, 39(11), 11418\u201311426. https:\/\/doi.org\/10.1609\/aaai.v39i11.33242","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"issue":"4","key":"975_CR2","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/S10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso, C., Bulut, M., Lee, C., et al. (2008). IEMOCAP: interactive emotional dyadic motion capture database. Lang Resour Evaluation, 42(4), 335\u2013359. https:\/\/doi.org\/10.1007\/S10579-008-9076-6","journal-title":"Lang Resour Evaluation"},{"key":"975_CR3","doi-asserted-by":"publisher","unstructured":"Cho, K., van Merri\u00ebnboer ,B., Gulcehre, C., et\u00a0al. (2014). Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp 1724\u20131734, https:\/\/doi.org\/10.3115\/v1\/D14-1179","DOI":"10.3115\/v1\/D14-1179"},{"key":"975_CR4","doi-asserted-by":"publisher","unstructured":"Czerwinski, M., Gilad-Bachrach, R., Iqbal, S., et\u00a0al. (2016). Challenges for designing notifications for affective computing systems. In: Proceedings of the 2016 ACM international joint conference on pervasive and ubiquitous computing: adjunct, pp. 1554\u20131559. https:\/\/doi.org\/10.1145\/2968219.2968548","DOI":"10.1145\/2968219.2968548"},{"key":"975_CR5","doi-asserted-by":"publisher","unstructured":"Eyben, F., W\u00f6llmer, M., Schuller, B.W. (2010). Opensmile: the munich versatile and fast open-source audio feature extractor. In: Proceedings of the 18th international conference on multimedia, pp. 1459\u20131462, https:\/\/doi.org\/10.1145\/1873951.1874246","DOI":"10.1145\/1873951.1874246"},{"key":"975_CR6","doi-asserted-by":"publisher","unstructured":"Gan, X., Huang, X., Zou, S. (2025) Intentional tendency-based dynamic heterogeneous graph network for emotion recognition in conversations. Journal of Intelligent Information System pp. 1\u201322. https:\/\/doi.org\/10.1007\/s10844-025-00925-9","DOI":"10.1007\/s10844-025-00925-9"},{"key":"975_CR7","doi-asserted-by":"publisher","unstructured":"Ghosal, D., Majumder, N., Poria, S., et al. (2019). DialogueGCN: A graph convolutional neural network for emotion recognition in conversation. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 154\u2013164. https:\/\/doi.org\/10.18653\/v1\/D19-1015","DOI":"10.18653\/v1\/D19-1015"},{"key":"975_CR8","doi-asserted-by":"publisher","unstructured":"Guo, Z., Jin, T., Zhao, Z. (2024). Multimodal prompt learning with missing modalities for sentiment analysis and emotion recognition. In: Ku LW, Martins A, Srikumar V (eds) Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, pp 1726\u20131736, https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.94","DOI":"10.18653\/v1\/2024.acl-long.94"},{"key":"975_CR9","doi-asserted-by":"publisher","unstructured":"Hu, D., Hou, X., Wei, L., et al. (2022). Mm-dfn: Multimodal dynamic fusion network for emotion recognition in conversations. In: ICASSP 2022 - 2022 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 7037\u20137041. https:\/\/doi.org\/10.1109\/ICASSP43922.2022.9747397","DOI":"10.1109\/ICASSP43922.2022.9747397"},{"key":"975_CR10","doi-asserted-by":"publisher","unstructured":"Hu, J., Liu, Y., Zhao, J., et\u00a0al. (2021). MMGCN: Multimodal fusion via deep graph convolution network for emotion recognition in conversation. In: Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing, pp. 5666\u20135675. https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.440","DOI":"10.18653\/v1\/2021.acl-long.440"},{"key":"975_CR11","doi-asserted-by":"publisher","unstructured":"Iandola, F., Moskewicz, M., Karayev, S., et\u00a0al. (2014). Densenet: Implementing efficient convnet descriptor pyramids. https:\/\/doi.org\/10.48550\/arXiv.1404.1869","DOI":"10.48550\/arXiv.1404.1869"},{"key":"975_CR12","doi-asserted-by":"publisher","unstructured":"Lei, Y., Yang, D., Li, M., et\u00a0al. (2023). Text-oriented modality reinforcement network for multimodal sentiment analysis from unaligned multimodal sequences. In: CICAI (2), pp. 189\u2013200. https:\/\/doi.org\/10.1007\/978-981-99-9119-8_18","DOI":"10.1007\/978-981-99-9119-8_18"},{"key":"975_CR13","doi-asserted-by":"publisher","unstructured":"Li, D., Wang, Y., Funakoshi, K., et\u00a0al. (2023). Joyful: Joint modality fusion and graph contrastive learning for multimodal emotion recognition. CoRR. https:\/\/doi.org\/10.48550\/ARXIV.2311.11009","DOI":"10.48550\/ARXIV.2311.11009"},{"key":"975_CR14","doi-asserted-by":"publisher","first-page":"985","DOI":"10.1109\/TASLP.2021.3049898","volume":"29","author":"Z Lian","year":"2021","unstructured":"Lian, Z., Liu, B., & Tao, J. (2021). Ctnet: Conversational transformer network for emotion recognition. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 29, 985\u20131000. https:\/\/doi.org\/10.1109\/TASLP.2021.3049898","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"975_CR15","doi-asserted-by":"publisher","unstructured":"Liu, Y., Ott, M., Goyal, N., et\u00a0al. (2019). Roberta: A robustly optimized BERT pretraining approach. https:\/\/doi.org\/10.48550\/arXiv.1907.11692","DOI":"10.48550\/arXiv.1907.11692"},{"issue":"2","key":"975_CR16","doi-asserted-by":"publisher","first-page":"1438","DOI":"10.1609\/aaai.v39i2.32134","volume":"39","author":"YK Liu","year":"2025","unstructured":"Liu, Y. K., Cai, J., Lu, B. L., et al. (2025). Multi-to-single: Reducing multimodal dependency in emotion recognition through contrastive learning. Proceedings of the AAAI Conference on Artificial Intelligence, 39(2), 1438\u20131446. https:\/\/doi.org\/10.1609\/aaai.v39i2.32134","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"975_CR17","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1109\/TMM.2023.3271019","volume":"26","author":"H Ma","year":"2024","unstructured":"Ma, H., Wang, J., Lin, H., et al. (2024). A transformer-based model with self-distillation for multimodal emotion recognition in conversations. IEEE Transactions on Multimedia, 26, 776\u2013788. https:\/\/doi.org\/10.1109\/TMM.2023.3271019","journal-title":"IEEE Transactions on Multimedia"},{"key":"975_CR18","doi-asserted-by":"publisher","unstructured":"Majumder, N., Poria, S., Hazarika, D., et\u00a0al. (2019). Dialoguernn: An attentive rnn for emotion detection in conversations. In: Proceedings of the AAAI conference on artificial intelligence, pp 6818\u20136825.https:\/\/doi.org\/10.1609\/AAAI.V33I01.33016818","DOI":"10.1609\/AAAI.V33I01.33016818"},{"key":"975_CR19","doi-asserted-by":"publisher","first-page":"4298","DOI":"10.1109\/TASLP.2024.3434495","volume":"32","author":"T Meng","year":"2024","unstructured":"Meng, T., Zhang, F., Shou, Y., et al. (2024). Masked graph learning with recurrent alignment for multimodal emotion recognition in conversation. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 32, 4298\u20134312. https:\/\/doi.org\/10.1109\/TASLP.2024.3434495","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"},{"key":"975_CR20","unstructured":"Nguyen, C.V.T., Nguyen, C.B., Le, D.T., et\u00a0al .(2024a). Curriculum learning meets directed acyclic graph for multimodal emotion recognition. In: Calzolari N, Kan MY, Hoste V, et\u00a0al (eds) Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pp. 4259\u20134265"},{"key":"975_CR21","unstructured":"Nguyen, C.V.T., Nguyen, C.B., Le, D.T., et\u00a0al .(2024b). Curriculum learning meets directed acyclic graph for multimodal emotion recognition. In: Proceedings of the 2024 joint international conference on computational linguistics, language resources and evaluation (LREC-COLING 2024), pp 4259\u20134265"},{"key":"975_CR22","doi-asserted-by":"publisher","unstructured":"Pham, H., Liang, P.P., Manzini, T., et\u00a0al. (2019). Found in translation: Learning robust joint representations by cyclic translations between modalities. In: AAAI, pp. 6892\u20136899.https:\/\/doi.org\/10.1609\/AAAI.V33I01.33016892","DOI":"10.1609\/AAAI.V33I01.33016892"},{"key":"975_CR23","doi-asserted-by":"publisher","unstructured":"Poria, S., Hazarika, D., Majumder, N., et\u00a0al. (2019). MELD: A multimodal multi-party dataset for emotion recognition in conversations. In: Proceedings of the 57th annual meeting of the association for computational linguistics, pp. 527\u2013536, https:\/\/doi.org\/10.18653\/v1\/P19-1050","DOI":"10.18653\/v1\/P19-1050"},{"key":"975_CR24","doi-asserted-by":"publisher","unstructured":"Pujol, F.A., Mora, H., Mart\u00ednez, A. (2019). Emotion recognition to improve e-healthcare systems in smart cities. In: Research & Innovation Forum 2019 - Technology, Innovation, Education, and their Social Impact, RIIFORUM 2019, Rome, Italy, April 24-26, 2019, pp. 245\u2013254. https:\/\/doi.org\/10.1007\/978-3-030-30809-4_23","DOI":"10.1007\/978-3-030-30809-4_23"},{"key":"975_CR25","doi-asserted-by":"publisher","unstructured":"Shen, W., Wu, S., Yang, Y., et\u00a0al. (2021). Directed acyclic graph network for conversational emotion recognition. In: Proceedings of the 59th annual meeting of the association for computational linguistics and the 11th international joint conference on natural language processing (Volume 1: Long Papers), pp. 1551\u20131560. https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.123","DOI":"10.18653\/v1\/2021.acl-long.123"},{"key":"975_CR26","doi-asserted-by":"publisher","first-page":"9008","DOI":"10.1109\/TMM.2024.3384678","volume":"26","author":"Z Song","year":"2024","unstructured":"Song, Z., Hu, Z., Zhou, Y., et al. (2024). Embedded heterogeneous attention transformer for cross-lingual image captioning. IEEE Transactions on Multimedia, 26, 9008\u20139020. https:\/\/doi.org\/10.1109\/TMM.2024.3384678","journal-title":"IEEE Transactions on Multimedia"},{"key":"975_CR27","doi-asserted-by":"publisher","unstructured":"Tsai, Y.H.H., Bai, S., Liang, P.P., et\u00a0al. (2019). Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference. Association for computational linguistics. Meeting, p. 6558, https:\/\/doi.org\/10.18653\/V1\/P19-1656","DOI":"10.18653\/V1\/P19-1656"},{"key":"975_CR28","doi-asserted-by":"publisher","unstructured":"Tu, G., Wang, J., Li, Z., et\u00a0al. (2024a). Multiple knowledge-enhanced interactive graph network for multimodal conversational emotion recognition. In: Findings of the association for computational linguistics: EMNLP 2024, pp 3861\u20133874, https:\/\/doi.org\/10.18653\/v1\/2024.findings-emnlp.222","DOI":"10.18653\/v1\/2024.findings-emnlp.222"},{"issue":"17","key":"975_CR29","doi-asserted-by":"publisher","first-page":"19089","DOI":"10.1609\/aaai.v38i17.29876","volume":"38","author":"G Tu","year":"2024","unstructured":"Tu, G., Xie, T., Liang, B., et al. (2024). Adaptive graph learning for multimodal conversational emotion detection. Proceedings of the AAAI Conference on Artificial Intelligence, 38(17), 19089\u201319097. https:\/\/doi.org\/10.1609\/aaai.v38i17.29876","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"975_CR30","doi-asserted-by":"publisher","unstructured":"Wang, P., Zhou, Q., Wu, Y., et\u00a0al. (2024). DLF: disentangled-language-focused multimodal sentiment analysis. CoRR. https:\/\/doi.org\/10.48550\/ARXIV.2412.12225","DOI":"10.48550\/ARXIV.2412.12225"},{"key":"975_CR31","doi-asserted-by":"publisher","unstructured":"Wang, Y., Cui, Z., Li, Y. (2023). Distribution-consistent modal recovering for incomplete multimodal learning. In: 2023 IEEE\/CVF international conference on computer vision (ICCV), pp. 21968\u201321977.https:\/\/doi.org\/10.1109\/ICCV51070.2023.02013","DOI":"10.1109\/ICCV51070.2023.02013"},{"issue":"2","key":"975_CR32","doi-asserted-by":"publisher","first-page":"1574","DOI":"10.1609\/aaai.v39i2.32149","volume":"39","author":"Y Wang","year":"2025","unstructured":"Wang, Y., Fang, X., Yin, H., et al. (2025). Big-fusion: Brain-inspired global-local context fusion framework for multimodal emotion recognition in conversations. Proceedings of the AAAI Conference on Artificial Intelligence, 39(2), 1574\u20131582. https:\/\/doi.org\/10.1609\/aaai.v39i2.32149","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"issue":"2","key":"975_CR33","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1007\/S10844-024-00879-4","volume":"63","author":"J Wu","year":"2025","unstructured":"Wu, J., Wu, J., Zheng, Y., et al. (2025). MLGAT: multi-layer graph attention networks for multimodal emotion recognition in conversations. Journal of Intelligent Information System, 63(2), 375\u2013394. https:\/\/doi.org\/10.1007\/S10844-024-00879-4","journal-title":"Journal of Intelligent Information System"},{"key":"975_CR34","doi-asserted-by":"publisher","unstructured":"Yang, H., Gao, X., Wu, J., et\u00a0al. (2023). Self-adaptive context and modal-interaction modeling for multimodal emotion recognition. In: Rogers A, Boyd-Graber J, Okazaki N (eds) Findings of the Association for Computational Linguistics: ACL 2023, pp. 6267\u20136281. https:\/\/doi.org\/10.18653\/v1\/2023.findings-acl.390","DOI":"10.18653\/v1\/2023.findings-acl.390"},{"issue":"24","key":"975_CR35","doi-asserted-by":"publisher","first-page":"25642","DOI":"10.1609\/aaai.v39i24.34755","volume":"39","author":"Y Yang","year":"2025","unstructured":"Yang, Y., Dong, X., & Qiang, Y. (2025). Mse-adapter: A lightweight plugin endowing llms with the capability to perform multimodal sentiment analysis and emotion recognition. Proceedings of the AAAI Conference on Artificial Intelligence, 39(24), 25642\u201325650. https:\/\/doi.org\/10.1609\/aaai.v39i24.34755","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"975_CR36","doi-asserted-by":"publisher","unstructured":"Zhang, X., Li, Y. (2023). A cross-modality context fusion and semantic refinement network for emotion recognition in conversation. In: Rogers A, Boyd-Graber J, Okazaki N (eds) Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp 13099\u201313110. https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.732","DOI":"10.18653\/v1\/2023.acl-long.732"},{"key":"975_CR37","doi-asserted-by":"publisher","unstructured":"Zheng, X., Zhao, G., Zhu, L., et\u00a0al. (2022). Perd: Personalized emoji recommendation with dynamic user preference. In: Proceedings of the 45th international ACM SIGIR conference on research and development in information retrieval, pp 1922\u20131926. https:\/\/doi.org\/10.1145\/3477495.3531779","DOI":"10.1145\/3477495.3531779"},{"key":"975_CR38","doi-asserted-by":"publisher","unstructured":"Zhong, P., Wang, D., Miao, C. (2019). Knowledge-enriched transformer for emotion detection in textual conversations. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP), pp. 165\u2013176. https:\/\/doi.org\/10.18653\/v1\/D19-1016","DOI":"10.18653\/v1\/D19-1016"},{"key":"975_CR39","doi-asserted-by":"publisher","unstructured":"Zou, S., Huang, X., Shen, X., et\u00a0al. (2022). Improving multimodal fusion with main modal transformer for emotion recognition in conversation. Knowledge-Based Systems 258:109978. https:\/\/doi.org\/10.1016\/j.knosys.2022.109978","DOI":"10.1016\/j.knosys.2022.109978"},{"key":"975_CR40","doi-asserted-by":"publisher","unstructured":"Zou, S., Huang, X., Shen, X. (2023). Multimodal prompt transformer with hybrid contrastive learning for emotion recognition in conversation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 5994\u20136003, https:\/\/doi.org\/10.1145\/3581783.3611805","DOI":"10.1145\/3581783.3611805"}],"container-title":["Journal of Intelligent Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10844-025-00975-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10844-025-00975-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10844-025-00975-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,28]],"date-time":"2025-10-28T08:10:16Z","timestamp":1761639016000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10844-025-00975-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,4]]},"references-count":40,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["975"],"URL":"https:\/\/doi.org\/10.1007\/s10844-025-00975-z","relation":{},"ISSN":["0925-9902","1573-7675"],"issn-type":[{"value":"0925-9902","type":"print"},{"value":"1573-7675","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,4]]},"assertion":[{"value":"25 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 July 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 July 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 September 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors state that this research complies with ethical standards. This research does not involve either human participants or animals.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}},{"value":"The authors declare no competing interests.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"The datasets used during the current study are available. Additionally, the datasets generated, model settings, and training processes are available from the corresponding author upon reasonable request.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Availability of supporting data"}}]}}