{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T16:20:37Z","timestamp":1759940437949,"version":"3.38.0"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,7,10]],"date-time":"2024-07-10T00:00:00Z","timestamp":1720569600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,7,10]],"date-time":"2024-07-10T00:00:00Z","timestamp":1720569600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Intell Inf Syst"],"published-print":{"date-parts":[[2025,2]]},"DOI":"10.1007\/s10844-024-00858-9","type":"journal-article","created":{"date-parts":[[2024,7,10]],"date-time":"2024-07-10T09:04:42Z","timestamp":1720602282000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Multi-task learning and mutual information maximization with crossmodal transformer for multimodal sentiment analysis"],"prefix":"10.1007","volume":"63","author":[{"given":"Yang","family":"Shi","sequence":"first","affiliation":[]},{"given":"Jinglang","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Lei","family":"Liao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,7,10]]},"reference":[{"key":"858_CR1","doi-asserted-by":"publisher","unstructured":"Akhtar, M.S., Chauhan, D.S., Ghosal D., et\u00a0al. (2019). Multi-task learning for multi-modal emotion recognition and sentiment analysis. In: Proceedings of NAACL-HLT (pp. 370\u2013379). https:\/\/doi.org\/10.18653\/v1\/N19-1034","DOI":"10.18653\/v1\/N19-1034"},{"key":"858_CR2","unstructured":"Alemi, A.A., Fischer, I., Dillon, J.V., et\u00a0al. (2016). Deep variational information bottleneck. https:\/\/doi.org\/10.48550\/arXiv.1612.00410"},{"issue":"1","key":"858_CR3","doi-asserted-by":"publisher","first-page":"157","DOI":"10.1007\/s10844-022-00745-1","volume":"60","author":"A Borah","year":"2023","unstructured":"Borah, A. (2023). Detecting covid-19 vaccine hesitancy in india: a multimodal transformer based approach. Journal of Intelligent Information Systems, 60(1), 157\u2013173. https:\/\/doi.org\/10.1007\/s10844-022-00745-1","journal-title":"Journal of Intelligent Information Systems"},{"key":"858_CR4","unstructured":"Chen, F., Luo, Z., Xu, Y., et\u00a0al. (2019). Complementary fusion of multi-features and multi-modalities in sentiment analysis. https:\/\/doi.org\/10.48550\/arXiv.1904.08138"},{"key":"858_CR5","doi-asserted-by":"publisher","unstructured":"Degottex, G., Kane, J., Drugman, T., et\u00a0al. (2014). Covarep\u2013a collaborative voice analysis repository for speech technologies. In: 2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (pp. 960\u2013964). https:\/\/doi.org\/10.1109\/ICASSP.2014.6853739","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"858_CR6","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., et\u00a0al. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. 
In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) (pp. 4171\u20134186). https:\/\/doi.org\/10.18653\/v1\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"858_CR7","volume-title":"What the Face Reveals: Basic and Applied Studies of Spontaneous Expression Using the Facial Action Coding System (FACS)","author":"P Ekman","year":"1997","unstructured":"Ekman, P., & Rosenberg, E. L. (1997). What the Face Reveals: Basic and Applied Studies of Spontaneous Expression Using the Facial Action Coding System (FACS). USA: Oxford University Press."},{"key":"858_CR8","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109133","volume":"134","author":"J Fan","year":"2023","unstructured":"Fan, J., Yu, Y., Huang, L., et al. (2023). Graphdpi: Partial label disambiguation by graph representation learning via mutual information maximization. Pattern Recognition, 134, 109133. https:\/\/doi.org\/10.1016\/j.patcog.2022.109133","journal-title":"Pattern Recognition"},{"key":"858_CR9","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1016\/j.neucom.2022.11.037","volume":"519","author":"A Fazekas","year":"2023","unstructured":"Fazekas, A., & Kov\u00e1cs, G. (2023). Optimal binning for a variance based alternative of mutual information in pattern recognition. Neurocomputing, 519, 135\u2013147. https:\/\/doi.org\/10.1016\/j.neucom.2022.11.037","journal-title":"Neurocomputing"},{"key":"858_CR10","doi-asserted-by":"publisher","first-page":"184","DOI":"10.1016\/j.inffus.2020.09.005","volume":"66","author":"D Gkoumas","year":"2021","unstructured":"Gkoumas, D., Li, Q., Lioma, C., et al. (2021). What makes the difference? an empirical comparison of fusion strategies for multimodal language analysis. Information Fusion, 66, 184\u2013197. https:\/\/doi.org\/10.1016\/j.inffus.2020.09.005","journal-title":"Information Fusion"},{"key":"858_CR11","doi-asserted-by":"publisher","unstructured":"Han, W., Chen, H., Gelbukh, A., et\u00a0al. (2021a). Bi-bimodal modality fusion for correlation-controlled multimodal sentiment analysis. In: Proceedings of the 2021 International Conference on Multimodal Interaction (pp. 6\u201315). https:\/\/doi.org\/10.1145\/3462244.3479919","DOI":"10.1145\/3462244.3479919"},{"key":"858_CR12","doi-asserted-by":"publisher","unstructured":"Han, W., Chen, H., Poria, S. (2021b). Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (pp. 9180\u20139192). https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.723","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"858_CR13","doi-asserted-by":"publisher","unstructured":"Hazarika, D., Zimmermann, R., Poria, S. (2020). Misa: Modality-invariant and-specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM International Conference on Multimedia (pp. 1122\u20131131). https:\/\/doi.org\/10.1145\/3394171.3413678","DOI":"10.1145\/3394171.3413678"},{"issue":"9","key":"858_CR14","doi-asserted-by":"publisher","first-page":"6289","DOI":"10.1109\/TNNLS.2021.3135420","volume":"34","author":"T Hoang","year":"2023","unstructured":"Hoang, T., Do, T. T., Nguyen, T. V., et al. (2023). Multimodal mutual information maximization: a novel approach for unsupervised deep cross-modal hashing. 
IEEE Transactions on Neural Networks and Learning Systems, 34(9), 6289\u20136302. https:\/\/doi.org\/10.1109\/TNNLS.2021.3135420","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"issue":"3","key":"858_CR15","doi-asserted-by":"publisher","first-page":"673","DOI":"10.1007\/s10844-023-00789-x","volume":"61","author":"R Kumari","year":"2023","unstructured":"Kumari, R., Ashok, N., Agrawal, P. K., et al. (2023). Identifying multimodal misinformation leveraging novelty detection and emotion recognition. Journal of Intelligent Information Systems, 61(3), 673\u2013694. https:\/\/doi.org\/10.1007\/s10844-023-00789-x","journal-title":"Journal of Intelligent Information Systems"},{"key":"858_CR16","doi-asserted-by":"publisher","unstructured":"Liu, Z., Feng, R., Chen, H., et\u00a0al. (2022). Temporal feature alignment and mutual information maximization for video-based human pose estimation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (pp. 10996\u201311006). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01073","DOI":"10.1109\/CVPR52688.2022.01073"},{"key":"858_CR17","doi-asserted-by":"publisher","unstructured":"Liu, Z., Shen, Y., Lakshminarasimhan, V.B., et\u00a0al. (2018). Efficient low-rank multimodal fusion with modality-specific factors. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 2247\u20132256). https:\/\/doi.org\/10.18653\/v1\/P18-1209","DOI":"10.18653\/v1\/P18-1209"},{"issue":"3","key":"858_CR18","doi-asserted-by":"publisher","first-page":"2276","DOI":"10.1109\/TAFFC.2022.3172360","volume":"14","author":"S Mai","year":"2023","unstructured":"Mai, S., Zeng, Y., Zheng, S., et al. (2023). Hybrid contrastive learning of tri-modal representation for multimodal sentiment analysis. IEEE Transactions on Affective Computing, 14(3), 2276\u20132289. https:\/\/doi.org\/10.1109\/TAFFC.2022.3172360","journal-title":"IEEE Transactions on Affective Computing"},{"key":"858_CR19","doi-asserted-by":"publisher","unstructured":"Mavromatis, C., & Karypis, G. (2021). Graph infoclust: Maximizing coarse-grain mutual information in graphs. In: Advances in Knowledge Discovery and Data Mining (pp. 541\u2013553). https:\/\/doi.org\/10.1007\/978-3-030-75762-5_43","DOI":"10.1007\/978-3-030-75762-5_43"},{"key":"858_CR20","unstructured":"Oord, A.V.D., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. https:\/\/doi.org\/10.48550\/arXiv.1807.03748"},{"key":"858_CR21","doi-asserted-by":"publisher","unstructured":"Peng, Z., Huang, W., Luo, M., et\u00a0al. (2020). Graph representation learning via graphical mutual information maximization. In: Proceedings of The Web Conference 2020 (pp. 259\u2013270). https:\/\/doi.org\/10.1145\/3366423.3380112","DOI":"10.1145\/3366423.3380112"},{"key":"858_CR22","doi-asserted-by":"crossref","unstructured":"Rahman, W., Hasan, M.K., Lee, S., et\u00a0al. (2020). Integrating multimodal information in large pretrained transformers. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (pp. 2359\u20132369). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.214","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"858_CR23","doi-asserted-by":"publisher","unstructured":"Sun, Z., Sarma, P., Sethares, W., et al. (2020). Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence (pp. 8992\u20138999). 
https:\/\/doi.org\/10.1609\/aaai.v34i05.6431","DOI":"10.1609\/aaai.v34i05.6431"},{"issue":"1","key":"858_CR24","doi-asserted-by":"publisher","first-page":"309","DOI":"10.1109\/TAFFC.2023.3274829","volume":"15","author":"L Sun","year":"2024","unstructured":"Sun, L., Lian, Z., Liu, B., et al. (2024). Efficient multimodal transformer with dual-level feature restoration for robust multimodal sentiment analysis. IEEE Transactions on Affective Computing, 15(1), 309\u2013325. https:\/\/doi.org\/10.1109\/TAFFC.2023.3274829","journal-title":"IEEE Transactions on Affective Computing"},{"key":"858_CR25","doi-asserted-by":"publisher","unstructured":"Tsai, Y.H.H., Bai, S., Liang, P.P., et\u00a0al. (2019a). Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (pp. 6558\u20136569). https:\/\/doi.org\/10.18653\/v1\/p19-1656","DOI":"10.18653\/v1\/p19-1656"},{"key":"858_CR26","doi-asserted-by":"publisher","unstructured":"Tsai, Y.H.H., Liang, P.P., Zadeh, A., et\u00a0al. (2019b). Learning factorized multimodal representations. In: International Conference on Learning Representations. https:\/\/doi.org\/10.48550\/arXiv.1806.06176","DOI":"10.48550\/arXiv.1806.06176"},{"issue":"2","key":"858_CR27","doi-asserted-by":"publisher","first-page":"367","DOI":"10.1007\/s10844-022-00764-y","volume":"61","author":"SK Uppada","year":"2023","unstructured":"Uppada, S. K., & Patel, P. (2023). An image and text-based multimodal model for detecting fake news in osn\u2019s. Journal of Intelligent Information Systems, 61(2), 367\u2013393. https:\/\/doi.org\/10.1007\/s10844-022-00764-y","journal-title":"Journal of Intelligent Information Systems"},{"key":"858_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109237","volume":"137","author":"Y Wang","year":"2023","unstructured":"Wang, Y., Chang, D., Fu, Z., et al. (2023). Learning a bi-directional discriminative representation for deep clustering. Pattern Recognition, 137, 109237. https:\/\/doi.org\/10.1016\/j.patcog.2022.109237","journal-title":"Pattern Recognition"},{"key":"858_CR29","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109259","volume":"136","author":"D Wang","year":"2023","unstructured":"Wang, D., Guo, X., Tian, Y., et al. (2023). Tetfn: A text enhanced transformer fusion network for multimodal sentiment analysis. Pattern Recognition, 136, 109259. https:\/\/doi.org\/10.1016\/j.patcog.2022.109259","journal-title":"Pattern Recognition"},{"key":"858_CR30","doi-asserted-by":"publisher","first-page":"109273","DOI":"10.1016\/j.patcog.2022.109273","volume":"137","author":"Y Wang","year":"2023","unstructured":"Wang, Y., Pang, W., & Jiao, Z. (2023). An adaptive mutual k-nearest neighbors clustering algorithm based on maximizing mutual information. Pattern Recognition, 137, 109273. https:\/\/doi.org\/10.1016\/j.patcog.2022.109273","journal-title":"Pattern Recognition"},{"key":"858_CR31","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1016\/j.ins.2023.01.116","volume":"628","author":"J Wang","year":"2023","unstructured":"Wang, J., Wang, S., Lin, M., et al. (2023). Learning speaker-independent multimodal representation for sentiment analysis. Information Sciences, 628, 208\u2013225. 
https:\/\/doi.org\/10.1016\/j.ins.2023.01.116","journal-title":"Information Sciences"},{"issue":"3","key":"858_CR32","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1109\/TAFFC.2017.2762299","volume":"9","author":"XS Wei","year":"2017","unstructured":"Wei, X. S., Zhang, C. L., Zhang, H., et al. (2017). Deep bimodal regression of apparent personality traits from short video sequences. IEEE Transactions on Affective Computing, 9(3), 303\u2013315. https:\/\/doi.org\/10.1109\/TAFFC.2017.2762299","journal-title":"IEEE Transactions on Affective Computing"},{"key":"858_CR33","doi-asserted-by":"publisher","first-page":"107676","DOI":"10.1016\/j.knosys.2021.107676","volume":"235","author":"T Wu","year":"2022","unstructured":"Wu, T., Peng, J., Zhang, W., et al. (2022). Video sentiment analysis with bimodal information-augmented multi-head attention. Knowledge-Based Systems, 235, 107676. https:\/\/doi.org\/10.1016\/j.knosys.2021.107676","journal-title":"Knowledge-Based Systems"},{"key":"858_CR34","doi-asserted-by":"publisher","unstructured":"Yang, K., Xu, H., Gao, K. (2020). Cm-bert: Cross-modal bert for text-audio sentiment analysis. In: Proceedings of the 28th ACM International Conference on Multimedia (pp. 521\u2013528). https:\/\/doi.org\/10.1145\/3394171.3413690","DOI":"10.1145\/3394171.3413690"},{"key":"858_CR35","doi-asserted-by":"publisher","first-page":"108488","DOI":"10.1016\/j.knosys.2022.108488","volume":"243","author":"P Yang","year":"2022","unstructured":"Yang, P., Ge, Y., Yao, Y., et al. (2022). Gcn-based document representation for keyphrase generation enhanced by maximizing mutual information. Knowledge-Based Systems, 243, 108488. https:\/\/doi.org\/10.1016\/j.knosys.2022.108488","journal-title":"Knowledge-Based Systems"},{"key":"858_CR36","doi-asserted-by":"publisher","unstructured":"Yu, W., Xu, H., Meng, F., et\u00a0al. (2020). Ch-sims: A Chinese multimodal sentiment analysis dataset with fine-grained annotation of modality. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (pp. 3718\u20133727). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.343","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"858_CR37","doi-asserted-by":"publisher","unstructured":"Yu, W., Xu, H., Yuan, Z., et\u00a0al. (2021). Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. In: Proceedings of the AAAI Conference on Artificial Intelligence (pp. 10790\u201310797). https:\/\/doi.org\/10.1609\/aaai.v35i12.17289","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"858_CR38","doi-asserted-by":"publisher","unstructured":"Zadeh, A., Chen, M., Poria, S., et\u00a0al. (2017). Tensor fusion network for multimodal sentiment analysis. In: Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing (pp. 1103\u20131114). https:\/\/doi.org\/10.18653\/v1\/D17-1115","DOI":"10.18653\/v1\/D17-1115"},{"key":"858_CR39","doi-asserted-by":"publisher","unstructured":"Zadeh, A.B., Liang, P.P., Poria, S., et\u00a0al. (2018). Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 2236\u20132246). 
https:\/\/doi.org\/10.18653\/v1\/P18-1208","DOI":"10.18653\/v1\/P18-1208"},{"issue":"6","key":"858_CR40","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MIS.2016.94","volume":"31","author":"A Zadeh","year":"2016","unstructured":"Zadeh, A., Zellers, R., Pincus, E., et al. (2016). Multimodal sentiment intensity analysis in videos: Facial gestures and verbal messages. IEEE Intelligent Systems, 31(6), 82\u201388. https:\/\/doi.org\/10.1109\/MIS.2016.94","journal-title":"IEEE Intelligent Systems"}],"container-title":["Journal of Intelligent Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10844-024-00858-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10844-024-00858-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10844-024-00858-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,8]],"date-time":"2025-03-08T10:23:08Z","timestamp":1741429388000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10844-024-00858-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,10]]},"references-count":40,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2025,2]]}},"alternative-id":["858"],"URL":"https:\/\/doi.org\/10.1007\/s10844-024-00858-9","relation":{},"ISSN":["0925-9902","1573-7675"],"issn-type":[{"type":"print","value":"0925-9902"},{"type":"electronic","value":"1573-7675"}],"subject":[],"published":{"date-parts":[[2024,7,10]]},"assertion":[{"value":"7 December 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 March 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 March 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 July 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"Not Applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}
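Note: the record above is the envelope returned by the public Crossref REST API for this article's DOI. The following is a minimal sketch of retrieving and unpacking it, assuming Python with the third-party requests library installed; the endpoint https://api.crossref.org/works/{DOI} and the {"status", "message-type", "message"} envelope match the record shown, and the field accesses below are limited to keys that actually appear in it.

import requests

# Fetch the same work record shown above from the Crossref REST API.
DOI = "10.1007/s10844-024-00858-9"
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the "message" object is the record above

# Core bibliographic fields ("title" and "container-title" are lists).
title = work["title"][0]
authors = ", ".join(f"{a['given']} {a['family']}" for a in work["author"])
journal = work["container-title"][0]
year = work["issued"]["date-parts"][0][0]
print(f"{authors} ({year}). {title}. {journal}. https://doi.org/{work['DOI']}")

# References: Crossref supplies a "DOI" key where it matched one;
# otherwise only the free-form "unstructured" citation string is available.
for ref in work.get("reference", []):
    print(ref.get("DOI") or ref.get("unstructured", "(no data)"))

The same envelope structure holds for any work the API knows about, so swapping in another DOI is the usual way to reuse this snippet.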