{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T05:17:33Z","timestamp":1775193453114,"version":"3.50.1"},"reference-count":64,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,12,22]],"date-time":"2024-12-22T00:00:00Z","timestamp":1734825600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,22]],"date-time":"2024-12-22T00:00:00Z","timestamp":1734825600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076227"],"award-info":[{"award-number":["62076227"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s11263-024-02304-3","type":"journal-article","created":{"date-parts":[[2024,12,22]],"date-time":"2024-12-22T15:00:04Z","timestamp":1734879604000},"page":"3020-3040","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Noise-Resistant Multimodal Transformer for Emotion Recognition"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1913-0089","authenticated-orcid":false,"given":"Yuanyuan","family":"Liu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8570-5792","authenticated-orcid":false,"given":"Haoyu","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3180-0484","authenticated-orcid":false,"given":"Yibing","family":"Zhan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2518-9977","authenticated-orcid":false,"given":"Zijing","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5036-6439","authenticated-orcid":false,"given":"Guanghao","family":"Yin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5567-2735","authenticated-orcid":false,"given":"Lin","family":"Wei","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5004-8975","authenticated-orcid":false,"given":"Zhe","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,22]]},"reference":[{"key":"2304_CR1","unstructured":"Akbari, H., Yuan, L., Qian, R., Chuang, W., Chang, S., Cui, Y., & Gong, B. (2021). VATT: Transformers for multimodal self-supervised learning from raw video, audio and text. In Advances in neural information processing systems 34: Annual conference on neural information processing systems 2021, neurips 2021, December 6\u201314, 2021, virtual (pp. 24206\u201324221)."},{"key":"2304_CR2","doi-asserted-by":"crossref","unstructured":"Baltrusaitis, T., Robinson, P., & Morency, L. (2016). Openface: An open source facial behavior analysis toolkit. In 2016 IEEE winter conference on applications of computer vision (wacv) (pp. 1\u201310). 
IEEE Computer Society.","DOI":"10.1109\/WACV.2016.7477553"},{"key":"2304_CR3","doi-asserted-by":"crossref","unstructured":"Beale, R., & Peter, C. (2008). The role of affect and emotion in HCI. In Affect and emotion in human\u2013computer interaction (Vol. 4868, pp. 1\u201311). Springer.","DOI":"10.1007\/978-3-540-85099-1_1"},{"key":"2304_CR4","unstructured":"Bousmalis, K., Trigeorgis, G., Silberman, N., Krishnan, D., & Erhan, D. (2016). Domain separation networks. In Lee, D., Sugiyama, M., Luxburg, U., Guyon, I., & Garnett, R. (Eds.), Advances in neural information processing systems (Vol.\u00a029). Curran Associates, Inc."},{"issue":"4","key":"2304_CR5","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso, C., Bulut, M., Lee, C., Kazemzadeh, A., Mower, E., Kim, S., & Narayanan, S. S. (2008). IEMOCAP: Interactive emotional dyadic motion capture database. Language Resources and Evaluation, 42(4), 335\u2013359.","journal-title":"Language Resources and Evaluation"},{"key":"2304_CR6","doi-asserted-by":"crossref","unstructured":"Chauhan, D. S., Akhtar, M. S., Ekbal, A., & Bhattacharyya, P. (2019). Context-aware interactive attention for multi-modal sentiment and emotion analysis. In Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing, EMNLP-IJCNLP 2019, Hong Kong, China, November 3\u20137, 2019 (pp. 5646\u20135656). Association for Computational Linguistics.","DOI":"10.18653\/v1\/D19-1566"},{"key":"2304_CR7","doi-asserted-by":"crossref","unstructured":"Cornejo, J. Y. R., & Pedrini, H. (2019). Audio-visual emotion recognition using a hybrid deep convolutional neural network based on census transform. In 2019 IEEE international conference on systems, man and cybernetics (pp. 3396\u20133402). IEEE.","DOI":"10.1109\/SMC.2019.8914193"},{"key":"2304_CR8","doi-asserted-by":"crossref","unstructured":"Dai, W., Cahyawijaya, S., Liu, Z., & Fung, P. (2021). Multimodal end-to-end sparse model for emotion recognition. In Proceedings of the 2021 conference of the north American chapter of the association for computational linguistics: Human language technologies, NAACL-HLT 2021, online, June 6\u201311, 2021 (pp. 5305\u20135316). Association for Computational Linguistics.","DOI":"10.18653\/v1\/2021.naacl-main.417"},{"key":"2304_CR9","unstructured":"Ding, L., Wang, L., Liu, X., Wong, D. F., Tao, D., & Tu, Z. (2021). Understanding and improving lexical choice in non-autoregressive translation. In International conference on learning representations (iclr)."},{"issue":"3","key":"2304_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2682899","volume":"47","author":"SK D\u2019Mello","year":"2015","unstructured":"D\u2019Mello, S. K., & Kory, J. M. (2015). A review and meta-analysis of multimodal affect detection systems. ACM Computing Surveys (CsUR), 47(3), 1\u201336.","journal-title":"ACM Computing Surveys (CsUR)"},{"key":"2304_CR11","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., & Houlsby, N. (2021). An image is worth 16x16 words: Transformers for image recognition at scale. In ICLR."},{"key":"2304_CR12","doi-asserted-by":"crossref","unstructured":"El-Madany, N. E., He, Y., & Guan, L. (2016). Multiview emotion recognition via multi-set locality preserving canonical correlation analysis. 
In IEEE international symposium on circuits and systems(iscas) (pp. 590\u2013593). IEEE.","DOI":"10.1109\/ISCAS.2016.7527309"},{"key":"2304_CR13","doi-asserted-by":"crossref","unstructured":"Franceschini, R., Fini, E., Beyan, C., Conti, A., Arrigoni, F., & Ricci, E. (2022). Multimodal emotion recognition with modality-pairwise unsupervised contrastive loss. In 26th international conference on pattern recognition, ICPR 2022, Montreal, QC, Canada, August 21\u201325, 2022 (pp. 2589\u20132596). IEEE.","DOI":"10.1109\/ICPR56361.2022.9956589"},{"key":"2304_CR14","unstructured":"Ganin, Y., & Lempitsky, V. S. (2015). Unsupervised domain adaptation by backpropagation. In International conference on machine learning (Vol.\u00a037, pp. 1180\u20131189). JMLR."},{"key":"2304_CR15","unstructured":"Goodfellow, I. J., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., & Bengio, Y. (2014). Generative adversarial nets. In Advances in neural information processing systems (Vol.\u00a027, pp. 2672\u20132680)."},{"key":"2304_CR16","doi-asserted-by":"crossref","unstructured":"Hazarika, D., Zimmermann, R., & Poria, S. (2020). MISA: Modality-invariant and -specific representations for multimodal sentiment analysis. In Proceedings of the 28th ACM international conference on multimedia (pp. 1122\u20131131). ACM.","DOI":"10.1145\/3394171.3413678"},{"key":"2304_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In 2016 IEEE conference on computer vision and pattern recognition (cvpr) (pp. 770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"2304_CR18","doi-asserted-by":"crossref","unstructured":"He, Z., Zhang, L., Gao, X., & Zhang, D. (2022). Multi-adversarial faster-rcnn with paradigm teacher for unrestricted object detection. International Journal of Computer Vision, 1\u201321.","DOI":"10.1007\/s11263-022-01728-z"},{"key":"2304_CR19","doi-asserted-by":"crossref","unstructured":"Huang, J., Tao, J., Liu, B., Lian, Z., & Niu, M. (2020). Multimodal transformer fusion for continuous emotion recognition. In 2020 IEEE international conference on acoustics, speech and signal processing (icassp) (pp. 3507\u20133511). IEEE.","DOI":"10.1109\/ICASSP40776.2020.9053762"},{"key":"2304_CR20","unstructured":"Kenton, J. D. M-W. C., & Toutanova, L. K. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of naacl-hlt (pp. 4171\u20134186)."},{"key":"2304_CR21","unstructured":"Kim, W., Son, B., & Kim, I. (2021). Vilt: Vision-and-language transformer without convolution or region supervision. In Proceedings of the 38th international conference on machine learning, ICML 2021, 18\u201324 July 2021, virtual event (Vol.\u00a0139, pp. 5583\u20135594). PMLR."},{"key":"2304_CR22","doi-asserted-by":"crossref","unstructured":"Li, C., Deng, C., Li, N., Liu, W., Gao, X., & Tao, D. (2018). Self-supervised adversarial hashing networks for cross-modal retrieval. In 2018 IEEE\/cvf conference on computer vision and pattern recognition (pp. 4242\u20134251).","DOI":"10.1109\/CVPR.2018.00446"},{"key":"2304_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2024.102367","volume":"108","author":"Z Lian","year":"2024","unstructured":"Lian, Z., Sun, L., Sun, H., Chen, K., Wen, Z., Gu, H., & Tao, J. (2024). Gpt-4v with emotion: A zero-shot benchmark for generalized emotion recognition. 
Information Fusion, 108, 102367.","journal-title":"Information Fusion"},{"key":"2304_CR24","doi-asserted-by":"crossref","unstructured":"Liang, J., Li, R., & Jin, Q. (2020). Semi-supervised multi-modal emotion recognition with cross-modal distribution matching. In Proceedings of the 28th ACM international conference on multimedia (pp. 2852\u20132861). ACM.","DOI":"10.1145\/3394171.3413579"},{"key":"2304_CR25","doi-asserted-by":"crossref","unstructured":"Liu, Y., Dai, W., Feng, C., Wang, W., Yin, G., Zeng, J., & Shan, S. (2022). MAFW: A large-scale, multi-modal, compound affective database for dynamic facial expression recognition in the wild (pp. 24\u201332). ACM.","DOI":"10.1145\/3503161.3548190"},{"key":"2304_CR26","doi-asserted-by":"publisher","DOI":"10.1016\/J.PATCOG.2023.109368","volume":"138","author":"Y Liu","year":"2023","unstructured":"Liu, Y., Wang, W., Feng, C., Zhang, H., Chen, Z., & Zhan, Y. (2023). Expression snippet transformer for robust video-based facial expression recognition. Pattern Recognition, 138, 109368. https:\/\/doi.org\/10.1016\/J.PATCOG.2023.109368","journal-title":"Pattern Recognition"},{"key":"2304_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Y., Wang, W., Zhan, Y., Feng, S., Liu, K., & Chen, Z. (2023). Pose-disentangled contrastive learning for self-supervised facial representation. In Proceedings of the IEEE\/C conference on computer vision and pattern recognition (cvpr) (pp. 9717\u20139728).","DOI":"10.1109\/CVPR52729.2023.00937"},{"key":"2304_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., Shen, Y., Lakshminarasimhan, V. B., Liang, P. P., Zadeh, A., & Morency, L. (2018). Efficient low-rank multimodal fusion with modality-specific factors. In Proceedings of the 56th annual meeting of the association for computational linguistics, ACL 2018, Melbourne, Australia, July 15\u201320, 2018, volume 1: Long papers (pp. 2247\u20132256). Association for Computational Linguistics.","DOI":"10.18653\/v1\/P18-1209"},{"key":"2304_CR29","unstructured":"Luo, H., Ji, L., Huang, Y., Wang, B., Ji, S., & Li, T. (2021). Scalevlad: Improving multimodal sentiment analysis via multi-scale fusion of locally descriptors. arXiv:2112.01368"},{"key":"2304_CR30","doi-asserted-by":"crossref","unstructured":"Lv, F., Chen, X., Huang, Y., Duan, L., & Lin, G. (2021). Progressive modality reinforcement for human multimodal emotion recognition from unaligned multimodal sequences. In 2021 IEEE\/CVF conference on computer vision and pattern recognition (cvpr) (pp. 2554\u20132562). Computer Vision Foundation\/IEEE.","DOI":"10.1109\/CVPR46437.2021.00258"},{"key":"2304_CR31","doi-asserted-by":"publisher","first-page":"184","DOI":"10.1016\/j.inffus.2018.06.003","volume":"46","author":"Y Ma","year":"2019","unstructured":"Ma, Y., Hao, Y., Chen, M., Chen, J., Lu, P., & Kosir, A. (2019). Audio-visual emotion fusion (AVEF): A deep efficient weighted approach. Information Fusion, 46, 184\u2013192.","journal-title":"Information Fusion"},{"key":"2304_CR32","unstructured":"Maas, A. L., Hannun, A. Y., & Ng, A. Y., et\u00a0al. (2013). Rectifier nonlinearities improve neural network acoustic models. In Proceedings of the ICML (Vol.\u00a030, p. 3)."},{"key":"2304_CR33","doi-asserted-by":"crossref","unstructured":"Mao, H., Yuan, Z., Xu, H., Yu, W., Liu, Y., & Gao, K. (2022). M-SENA: An integrated platform for multimodal sentiment analysis. Proceedings of the 60th annual meeting of the association for computational linguistics (pp. 204\u2013213). 
Association for Computational Linguistics.","DOI":"10.18653\/v1\/2022.acl-demo.20"},{"key":"2304_CR34","doi-asserted-by":"crossref","unstructured":"Mittal, T., Bhattacharya, U., Chandra, R., Bera, A., & Manocha, D. (2020a). M3er: Multiplicative multimodal emotion recognition using facial, textual, and speech cues. In Proceedings of the AAAI conference on artificial intelligence (Vol.\u00a034, pp. 1359\u20131367).","DOI":"10.1609\/aaai.v34i02.5492"},{"key":"2304_CR35","doi-asserted-by":"crossref","unstructured":"Mittal, T., Bhattacharya, U., Chandra, R., Bera, A., & Manocha, D. (2020b). M3ER: multiplicative multimodal emotion recognition using facial, textual, and speech cues. In The thirty-fourth AAAI conference on artificial intelligence, AAAI 2020, the thirty-second innovative applications of artificial intelligence conference, IAAI 2020, the tenth AAAI symposium on educational advances in artificial intelligence, EAAI 2020, New York, February 7\u201312, 2020 (pp. 1359\u20131367). AAAI Press.","DOI":"10.1609\/aaai.v34i02.5492"},{"key":"2304_CR36","doi-asserted-by":"crossref","unstructured":"Niu, X., Yu, Z., Han, H., Li, X., Shan, S., & Zhao, G. (2020). Video-based remote physiological measurement via cross-verified feature disentangling. In Computer vision-ECCV 2020\u201416th European conference, Glasgow, August 23\u201328, 2020, proceedings, part II (Vol. 12347, pp. 295\u2013310). Springer.","DOI":"10.1007\/978-3-030-58536-5_18"},{"key":"2304_CR37","doi-asserted-by":"crossref","unstructured":"Pei, Z., Cao, Z., Long, M., & Wang, J. (2018). Multi-adversarial domain adaptation. In Proceedings of the AAAI conference on artificial intelligence (Vol.\u00a032, pp. 3934\u20133941). AAAI Press.","DOI":"10.1609\/aaai.v32i1.11767"},{"key":"2304_CR38","doi-asserted-by":"crossref","unstructured":"Pham, H., Liang, P. P., Manzini, T., Morency, L., & P\u00f3czos, B. (2019). Found in translation: Learning robust joint representations by cyclic translations between modalities. In Proceedings of the AAAI conference on artificial intelligence (Vol.\u00a033, pp. 6892\u20136899). AAAI Press.","DOI":"10.1609\/aaai.v33i01.33016892"},{"key":"2304_CR39","doi-asserted-by":"publisher","first-page":"141","DOI":"10.1016\/j.inffus.2018.06.004","volume":"46","author":"Y Qian","year":"2019","unstructured":"Qian, Y., Zhang, Y., Ma, X., Yu, H., & Peng, L. (2019). EARS: Emotion-aware recommender system based on hybrid information fusion. Information Fusion, 46, 141\u2013146.","journal-title":"Information Fusion"},{"key":"2304_CR40","doi-asserted-by":"crossref","unstructured":"Sahay, S., Okur, E., Kumar, S.H., & Nachman, L. (2020). Low rank fusion based transformers for multimodal sequences. arXiv:2007.02038","DOI":"10.18653\/v1\/2020.challengehml-1.4"},{"issue":"2","key":"2304_CR41","first-page":"176","volume":"12","author":"L Shen","year":"2009","unstructured":"Shen, L., Wang, M., & Shen, R. (2009). Affective e-learning: Using \u201cemotional\u2019\u2019 data to improve learning in pervasive learning environment. Journal of Educational Technology and Society, 12(2), 176\u2013189.","journal-title":"Journal of Educational Technology and Society"},{"key":"2304_CR42","doi-asserted-by":"crossref","unstructured":"Sun, Z., Sarma, P.K., Sethares, W.A., & Liang, Y. (2020). Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. In Proceedings of the AAAI conference on artificial intelligence (Vol.\u00a034, pp. 8992\u20138999). 
AAAI Press.","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"2304_CR43","doi-asserted-by":"crossref","unstructured":"Tsai, Y. H., Bai, S., Liang, P. P., Kolter, J. Z., Morency, L., & Salakhutdinov, R. (2019). Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the 57th conference of the association for computational linguistics, ACL 2019, Florence, July 28\u2013August 2, 2019, volume 1: Long papers (pp. 6558\u20136569). Association for Computational Linguistics.","DOI":"10.18653\/v1\/P19-1656"},{"key":"2304_CR44","unstructured":"Tsai, Y. H., Liang, P. P., Zadeh, A., Morency, L., & Salakhutdinov, R. (2019). Learning factorized multimodal representations. In ICLR."},{"key":"2304_CR45","unstructured":"Van\u00a0der Maaten, L., & Hinton, G. (2008). Visualizing data using t-sne. In Journal of machine learning research, 9(11)."},{"key":"2304_CR46","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., & Polosukhin, I. (2017). Attention is all you need. In Advances in neural information processing systems (Vol.\u00a030, pp. 5998\u20136008)."},{"key":"2304_CR47","doi-asserted-by":"crossref","unstructured":"Wang, B., Yang, Y., Xu, X., Hanjalic, A., & Shen, H. T. (2017). Adversarial cross-modal retrieval. In Proceedings of the 25th ACM international conference on multimedia (pp. 154\u2013162). ACM.","DOI":"10.1145\/3123266.3123326"},{"key":"2304_CR48","doi-asserted-by":"crossref","unstructured":"Wang, Y., Chen, Z., Chen, S., & Zhu, Y. (2022). MT-TCCT: Multi-task learning for multimodal emotion recognition. In Artificial neural networks and machine learning-ICANN 2022\u201431st international conference on artificial neural networks, Bristol, September 6\u20139, 2022, proceedings, part III (Vol. 13531, pp. 429\u2013442). Springer.","DOI":"10.1007\/978-3-031-15934-3_36"},{"issue":"5","key":"2304_CR49","doi-asserted-by":"publisher","first-page":"936","DOI":"10.1109\/TMM.2008.927665","volume":"10","author":"Y Wang","year":"2008","unstructured":"Wang, Y., & Guan, L. (2008). Recognizing human emotional state from audiovisual signals. IEEE Transactions on Multimedia, 10(5), 936\u2013946.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"12","key":"2304_CR50","doi-asserted-by":"publisher","first-page":"2849","DOI":"10.1007\/s11263-020-01340-z","volume":"128","author":"Y Wang","year":"2020","unstructured":"Wang, Y., Herranz, L., & van de Weijer, J. (2020). Mix and match networks: Cross-modal alignment for zero-pair image-to-image translation. International Journal of Computer Vision, 128(12), 2849\u20132872.","journal-title":"International Journal of Computer Vision"},{"key":"2304_CR51","doi-asserted-by":"crossref","unstructured":"Wang, Y., Shen, Y., Liu, Z., Liang, P. P., Zadeh, A., & Morency, L. (2019). Words can shift: Dynamically adjusting word representations using nonverbal behaviors. In Proceedings of the AAAI conference on artificial intelligence (Vol.\u00a033, pp. 7216\u20137223). AAAI Press.","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"2304_CR52","doi-asserted-by":"crossref","unstructured":"Yang, D., Huang, S., Kuang, H., Du, Y., & Zhang, L. (2022). Disentangled representation learning for multimodal emotion recognition. In Proceedings of the 30th ACM international conference on multimedia (pp. 1642\u20131651). ACM.","DOI":"10.1145\/3503161.3547754"},{"key":"2304_CR53","doi-asserted-by":"crossref","unstructured":"Yu, W., Xu, H., Meng, F., Zhu, Y., Ma, Y., Wu, J., & Yang, K. (2020). 
CH-SIMS: A Chinese multimodal sentiment analysis dataset with fine-grained annotation of modality. In Proceedings of the 58th annual meeting of the association for computational linguistics (pp. 3718\u20133727). Association for Computational Linguistics.","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"2304_CR54","doi-asserted-by":"crossref","unstructured":"Yu, W., Xu, H., Yuan, Z., & Wu, J. (2021). Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. In Proceedings of the AAAI conference on artificial intelligence (Vol.\u00a035, pp. 10790\u201310797).","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"2304_CR55","doi-asserted-by":"crossref","unstructured":"Yuan, Z., Li, W., Xu, H., & Yu, W. (2021). Transformer-based feature reconstruction network for robust multimodal sentiment analysis. In Proceedings of the 29th ACM international conference on multimedia (pp. 4400\u20134407). ACM.","DOI":"10.1145\/3474085.3475585"},{"key":"2304_CR56","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Chen, M., Poria, S., Cambria, E., & Morency, L. (2017). Tensor fusion network for multimodal sentiment analysis. In Proceedings of the 2017 conference on empirical methods in natural language processing (pp. 1103\u20131114). Association for Computational Linguistics.","DOI":"10.18653\/v1\/D17-1115"},{"key":"2304_CR57","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Liang, P. P., Poria, S., Cambria, E., & Morency, L. (2018). Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In Proceedings of the 56th annual meeting of the association for computational linguistics (pp. 2236\u20132246).","DOI":"10.18653\/v1\/P18-1208"},{"issue":"6","key":"2304_CR58","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MIS.2016.94","volume":"31","author":"A Zadeh","year":"2016","unstructured":"Zadeh, A., Zellers, R., Pincus, E., & Morency, L. (2016). Multimodal sentiment intensity analysis in videos: Facial gestures and verbal messages. IEEE Intelligent Systems, 31(6), 82\u201388.","journal-title":"IEEE Intelligent Systems"},{"key":"2304_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, H., Wang, Y., Yin, G., Liu, K., Liu, Y., & Yu, T. (2023). Learning language-guided adaptive hyper-modality representation for multimodal sentiment analysis. In Proceedings of the 2023 conference on empirical methods in natural language processing, EMNLP 2023, Singapore, December 6\u201310, 2023 (pp. 756\u2013767). Association for Computational Linguistics.","DOI":"10.18653\/v1\/2023.emnlp-main.49"},{"key":"2304_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Xu, Y., Zhang, J., & Tao, D. (2022). Vitaev2: Vision transformer advanced by exploring inductive bias for image recognition and beyond. abs\/2202.10108","DOI":"10.1007\/s11263-022-01739-w"},{"key":"2304_CR61","doi-asserted-by":"crossref","unstructured":"Zhang, S., Zhang, S., Huang, T., Gao, W., & Tian, Q. (2018). Learning affective features with a hybrid deep model for audio-visual emotion recognition. IEEE Transactions on Circuits and Systems for Video Technology, 28(10), 3030\u20133043.","DOI":"10.1109\/TCSVT.2017.2719043"},{"issue":"12","key":"2304_CR62","doi-asserted-by":"publisher","first-page":"5586","DOI":"10.1109\/TKDE.2021.3070203","volume":"34","author":"Y Zhang","year":"2022","unstructured":"Zhang, Y., & Yang, Q. (2022). A survey on multi-task learning. 
IEEE Transactions on Knowledge and Data Engineering, 34(12), 5586\u20135609.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"2304_CR63","doi-asserted-by":"crossref","unstructured":"Zhao, S., Ma, Y., Gu, Y., Yang, J., Xing, T., Xu, P., & Keutzer, K. (2020). An end-to-end visual-audio attention network for emotion recognition in user-generated videos. In Proceedings of the AAAI conference on artificial intelligence (Vol.\u00a034, pp. 303\u2013311). AAAI Press.","DOI":"10.1609\/aaai.v34i01.5364"},{"key":"2304_CR64","doi-asserted-by":"publisher","first-page":"2617","DOI":"10.1109\/TASLP.2021.3096037","volume":"29","author":"H Zhou","year":"2021","unstructured":"Zhou, H., Du, J., Zhang, Y., Wang, Q., Liu, Q., & Lee, C. (2021). Information fusion in attention networks using adaptive and multi-level factorized bilinear pooling for audio-visual emotion recognition. IEEE\/ACM Transactions on Audio, Speech, and Language Processing, 29, 2617\u20132629.","journal-title":"IEEE\/ACM Transactions on Audio, Speech, and Language Processing"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02304-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02304-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02304-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,17]],"date-time":"2025-04-17T06:02:16Z","timestamp":1744869736000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02304-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,22]]},"references-count":64,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["2304"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02304-3","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,22]]},"assertion":[{"value":"19 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 November 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 December 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declaration"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
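The record above follows the standard Crossref REST API "work" envelope: the bibliographic fields live under "message" (title, container-title, DOI, volume, page, reference-count, and a "reference" array whose entries carry either an "unstructured" string or a DOI). As a minimal sketch of how such a record could be fetched and inspected, assuming it was obtained from the public endpoint https://api.crossref.org/works/{DOI} (the field names below are taken directly from the JSON above; the endpoint is the standard Crossref one, not something stated in this record):

```python
# Hedged sketch: retrieve and inspect a Crossref "work" record like the one above.
# Assumption: the public Crossref REST API endpoint api.crossref.org/works/<DOI>
# is the source of this payload; only fields visible in the record are accessed.
import json
import urllib.request

DOI = "10.1007/s11263-024-02304-3"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    work = json.load(resp)["message"]  # bibliographic payload, as in the record above

print(work["title"][0])               # e.g. "Noise-Resistant Multimodal Transformer for Emotion Recognition"
print(work["container-title"][0])     # e.g. "International Journal of Computer Vision"
print(work["DOI"], work.get("volume"), work.get("page"))
print("reference count:", work.get("reference-count"))

# Each reference entry may carry an "unstructured" citation string and/or a DOI.
for ref in work.get("reference", [])[:3]:
    print(" -", ref.get("unstructured") or ref.get("DOI"))
```

This only reads fields that appear in the record itself; any other keys (abstracts, funder details beyond those shown, etc.) vary by deposit and should be accessed with .get() rather than assumed present.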