{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T05:30:45Z","timestamp":1778131845234,"version":"3.51.4"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2024,10,10]],"date-time":"2024-10-10T00:00:00Z","timestamp":1728518400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,10]],"date-time":"2024-10-10T00:00:00Z","timestamp":1728518400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LQ23F020021"],"award-info":[{"award-number":["LQ23F020021"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s00371-024-03668-w","type":"journal-article","created":{"date-parts":[[2024,10,10]],"date-time":"2024-10-10T12:11:09Z","timestamp":1728562269000},"page":"4405-4418","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["CTHFNet: contrastive translation and hierarchical fusion network for text\u2013video\u2013audio sentiment 
analysis"],"prefix":"10.1007","volume":"41","author":[{"given":"Qiaohong","family":"Chen","sequence":"first","affiliation":[]},{"given":"Shufan","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Xian","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Sun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,10]]},"reference":[{"key":"3668_CR1","doi-asserted-by":"publisher","first-page":"101490","DOI":"10.1016\/j.csl.2023.101490","volume":"80","author":"J Wang","year":"2023","unstructured":"Wang, J., Yang, S., Zhao, H., Yang, Y.: Social media popularity prediction with multimodal hierarchical fusion model. Comput. Speech Language 80, 101490 (2023)","journal-title":"Comput. Speech Language"},{"key":"3668_CR2","doi-asserted-by":"publisher","unstructured":"Li, H., Guo, A., Li, Y.: CCMA: CapsNet for audio\u2013video sentiment analysis using cross-modal attention. Visual Comput. (2024). https:\/\/doi.org\/10.1007\/s00371-024-03453-9","DOI":"10.1007\/s00371-024-03453-9"},{"key":"3668_CR3","doi-asserted-by":"publisher","first-page":"101402","DOI":"10.1016\/j.csl.2022.101402","volume":"76","author":"R Mokhosi","year":"2022","unstructured":"Mokhosi, R., Shikali, C., Qin, Z., Liu, Q.: Maximal activation weighted memory for aspect based sentiment analysis. Comput. Speech Language 76, 101402 (2022)","journal-title":"Comput. Speech Language"},{"key":"3668_CR4","doi-asserted-by":"crossref","unstructured":"Liu, H., Wang, W., Li, H.: Towards multi-modal sarcasm detection via hierarchical congruity modeling with knowledge enhancement, (2022) arXiv preprint arXiv:2210.03501","DOI":"10.18653\/v1\/2022.emnlp-main.333"},{"key":"3668_CR5","doi-asserted-by":"publisher","unstructured":"Murthy, J.S., Siddesh, G.M.: A smart video analytical framework for sarcasm detection using novel adaptive fusion network and sarcasnet-99 model. Visual Comput. (2024). 
https:\/\/doi.org\/10.1007\/s00371-023-03224-y","DOI":"10.1007\/s00371-023-03224-y"},{"key":"3668_CR6","doi-asserted-by":"crossref","unstructured":"Hu, G., Lin, T.-E., Zhao, Y., Lu, G., Wu, Y., Li, Y.: UNIMSE: towards unified multimodal sentiment analysis and emotion recognition. In: Conference on empirical methods in natural language processing, pp. 7837\u20137851 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.534"},{"issue":"3\u20134","key":"3668_CR7","doi-asserted-by":"publisher","first-page":"2072","DOI":"10.1002\/cav.2072","volume":"33","author":"J Dai","year":"2022","unstructured":"Dai, J., Zhang, X.: Automatic image caption generation using deep learning and multimodal attention. Comput. Anim. Virtual Worlds 33(3\u20134), 2072 (2022)","journal-title":"Comput. Anim. Virtual Worlds"},{"key":"3668_CR8","doi-asserted-by":"publisher","first-page":"107134","DOI":"10.1016\/j.knosys.2021.107134","volume":"226","author":"M Birjali","year":"2021","unstructured":"Birjali, M., Kasri, M., Beni-Hssane, A.: A comprehensive survey on sentiment analysis: approaches, challenges and trends. Knowl.-Based Syst. 226, 107134 (2021)","journal-title":"Knowl.-Based Syst."},{"key":"3668_CR9","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Chen, M., Poria, S., Cambria, E., Morency, L.-P.: Tensor fusion network for multimodal sentiment analysis. In: Conference on empirical methods in natural language processing, pp. 1103\u20131114 (2017)","DOI":"10.18653\/v1\/D17-1115"},{"key":"3668_CR10","doi-asserted-by":"crossref","unstructured":"Chen, M., Li, X.: SWAFN: sentimental words aware fusion network for multimodal sentiment analysis. In: Proceedings of the 28th international conference on computational linguistics, pp. 1067\u20131077 (2020)","DOI":"10.18653\/v1\/2020.coling-main.93"},{"key":"3668_CR11","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding, pp. 
4171\u20134186 (2018)"},{"key":"3668_CR12","doi-asserted-by":"publisher","first-page":"130","DOI":"10.1016\/j.neucom.2021.09.041","volume":"467","author":"B Yang","year":"2022","unstructured":"Yang, B., Shao, B., Wu, L., Lin, X.: Multimodal sentiment analysis with unidirectional modality translation. Neurocomputing 467, 130\u2013137 (2022)","journal-title":"Neurocomputing"},{"key":"3668_CR13","doi-asserted-by":"crossref","unstructured":"Chen, M., Wang, S., Liang, P.P., Baltru\u0161aitis, T., Zadeh, A., Morency, L.-P.: Multimodal sentiment analysis with word-level fusion and reinforcement learning. In: Proceedings of the 19th ACM international conference on multimodal interaction, pp. 163\u2013171 (2017)","DOI":"10.1145\/3136755.3136801"},{"key":"3668_CR14","doi-asserted-by":"crossref","unstructured":"Sun, Z., Sarma, P., Sethares, W., Liang, Y.: Learning relationships between text, audio, and video via deep canonical correlation for multimodal language analysis. In: Proceedings of the AAAI conference on artificial intelligence, vol. 34, pp. 8992\u20138999 (2020)","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"3668_CR15","doi-asserted-by":"crossref","unstructured":"Pham, H., Liang, P.P., Manzini, T., Morency, L.-P., P\u00f3czos, B.: Found in translation: learning robust joint representations by cyclic translations between modalities. In: Proceedings of the AAAI conference on artificial intelligence, vol. 33, pp. 6892\u20136899 (2019)","DOI":"10.1609\/aaai.v33i01.33016892"},{"issue":"1","key":"3668_CR16","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/s12559-022-10073-9","volume":"15","author":"F Wang","year":"2023","unstructured":"Wang, F., Tian, S., Yu, L., Liu, J., Wang, J., Li, K., Wang, Y.: TEDT: transformer-based encoding-decoding translation network for multimodal sentiment analysis. Cogn. Comput. 15(1), 289\u2013303 (2023)","journal-title":"Cogn. 
Comput."},{"key":"3668_CR17","doi-asserted-by":"crossref","unstructured":"Tsai, Y.-H.H., Bai, S., Liang, P.P., Kolter, J.Z., Morency, L.-P., Salakhutdinov, R.: Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference. Association for computational linguistics. Meeting, vol. 2019, p. 6558 (2019). NIH Public Access","DOI":"10.18653\/v1\/P19-1656"},{"key":"3668_CR18","doi-asserted-by":"crossref","unstructured":"Hazarika, D., Zimmermann, R., Poria, S.: Misa: Modality-invariant and-specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1122\u20131131 (2020)","DOI":"10.1145\/3394171.3413678"},{"key":"3668_CR19","doi-asserted-by":"crossref","unstructured":"Sun, H., Wang, H., Liu, J., Chen, Y.-W., Lin, L.: CubeMLP: an mlp-based model for multimodal sentiment analysis and depression estimation. In: Proceedings of the 30th ACM international conference on multimedia, pp. 3722\u20133729 (2022)","DOI":"10.1145\/3503161.3548025"},{"key":"3668_CR20","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TMM.2021.3120873","volume":"25","author":"X Lin","year":"2021","unstructured":"Lin, X., Sun, S., Huang, W., Sheng, B., Li, P., Feng, D.D.: EAPT: efficient attention pyramid transformer for image processing. IEEE Trans. Multimedia 25, 50\u201361 (2021)","journal-title":"IEEE Trans. Multimedia"},{"issue":"1","key":"3668_CR21","doi-asserted-by":"publisher","first-page":"163","DOI":"10.1109\/TII.2021.3085669","volume":"18","author":"J Li","year":"2021","unstructured":"Li, J., Chen, J., Sheng, B., Li, P., Yang, P., Feng, D.D., Qi, J.: Automatic detection and classification system of domestic waste via multimodel cascaded convolutional neural network. IEEE Trans. Industr. Inf. 18(1), 163\u2013173 (2021)","journal-title":"IEEE Trans. Industr. 
Inf."},{"key":"3668_CR22","doi-asserted-by":"crossref","unstructured":"Han, W., Chen, H., Poria, S.: Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. In: Conference on empirical methods in natural language processing, pp. 9180\u20139192 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"3668_CR23","doi-asserted-by":"crossref","unstructured":"Morency, L.-P., Mihalcea, R., Doshi, P.: Towards multimodal sentiment analysis: Harvesting opinions from the web. In: Proceedings of the 13th international conference on multimodal interfaces, pp. 169\u2013176 (2011)","DOI":"10.1145\/2070481.2070509"},{"key":"3668_CR24","doi-asserted-by":"crossref","unstructured":"Liu, Z., Shen, Y., Lakshminarasimhan, V.B., Liang, P.P., Zadeh, A., Morency, L.-P.: Efficient low-rank multimodal fusion with modality-specific factors. In: Annual meeting of the association for computational linguistics, pp. 2247\u20132256 (2018)","DOI":"10.18653\/v1\/P18-1209"},{"key":"3668_CR25","unstructured":"Tsai, Y.-H.H., Liang, P.P., Zadeh, A., Morency, L.-P., Salakhutdinov, R.: Learning factorized multimodal representations. In: International conference on learning representations (2019)"},{"key":"3668_CR26","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Liang, P.P., Mazumder, N., Poria, S., Cambria, E., Morency, L.-P.: Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI conference on artificial intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"3668_CR27","doi-asserted-by":"crossref","unstructured":"Yu, W., Xu, H., Yuan, Z., Wu, J.: Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. In: Proceedings of the AAAI conference on artificial intelligence, vol. 35, pp. 
10790\u201310797 (2021)","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"3668_CR28","doi-asserted-by":"publisher","first-page":"127201","DOI":"10.1016\/j.neucom.2023.127201","volume":"571","author":"Y Fu","year":"2024","unstructured":"Fu, Y., Zhang, Z., Yang, R., Yao, C.: Hybrid cross-modal interaction learning for multimodal sentiment analysis. Neurocomputing 571, 127201 (2024)","journal-title":"Neurocomputing"},{"issue":"13s","key":"3668_CR29","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3586075","volume":"55","author":"R Das","year":"2023","unstructured":"Das, R., Singh, T.D.: Multimodal sentiment analysis: a survey of methods, trends, and challenges. ACM Comput. Surv. 55(13s), 1\u201338 (2023)","journal-title":"ACM Comput. Surv."},{"key":"3668_CR30","doi-asserted-by":"crossref","unstructured":"Han, W., Chen, H., Gelbukh, A., Zadeh, A., Morency, L.-p., Poria, S.: Bi-bimodal modality fusion for correlation-controlled multimodal sentiment analysis. In: Proceedings of the 2021 international conference on multimodal interaction, pp. 6\u201315 (2021)","DOI":"10.1145\/3462244.3479919"},{"key":"3668_CR31","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"3668_CR32","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International conference on machine learning, pp. 1597\u20131607 (2020). PMLR"},{"key":"3668_CR33","doi-asserted-by":"crossref","unstructured":"Ma, Y., Li, L., Chen, H., Li, X., Chen, J., Zhu, P., Peng, T., Pan, X.: Highlight removal from a single image based on a prior knowledge guided unsupervised cyclegan. In: Computer graphics international conference, pp. 388\u2013399 (2023). 
Springer","DOI":"10.1007\/978-3-031-50069-5_32"},{"key":"3668_CR34","first-page":"18661","volume":"33","author":"P Khosla","year":"2020","unstructured":"Khosla, P., Teterwak, P., Wang, C., Sarna, A., Tian, Y., Isola, P., Maschinot, A., Liu, C., Krishnan, D.: Supervised contrastive learning. Adv. Neural. Inf. Process. Syst. 33, 18661\u201318673 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3668_CR35","unstructured":"Lin, Z., Liang, B., Long, Y., Dang, Y., Yang, M., Zhang, M., Xu, R.: Modeling intra-and inter-modal relations: Hierarchical graph contrastive learning for multimodal sentiment analysis. In: Proceedings of the 29th international conference on computational linguistics, pp. 7124\u20137135 (2022)"},{"key":"3668_CR36","unstructured":"Wang, J., Yu, L.-C., Zhang, X.: Softmcl: Soft momentum contrastive learning for fine-grained sentiment-aware pre-training. In: International conference on computational linguistics, pp. 15012\u201315023 (2024)"},{"key":"3668_CR37","doi-asserted-by":"crossref","unstructured":"Yang, Y., Dong, X., Qiang, Y.: CLGSI: a multimodal sentiment analysis framework based on contrastive learning guided by sentiment intensity. In: Findings of the association for computational linguistics: NAACL 2024, pp. 2099\u20132110 (2024)","DOI":"10.18653\/v1\/2024.findings-naacl.135"},{"key":"3668_CR38","doi-asserted-by":"crossref","unstructured":"Baltru\u0161aitis, T., Robinson, P., Morency, L.-P.: Openface: an open source facial behavior analysis toolkit. In: 2016 IEEE winter conference on applications of computer vision (WACV), pp. 1\u201310 (2016). IEEE","DOI":"10.1109\/WACV.2016.7477553"},{"key":"3668_CR39","doi-asserted-by":"crossref","unstructured":"Degottex, G., Kane, J., Drugman, T., Raitio, T., Scherer, S.: Covarep-a collaborative voice analysis repository for speech technologies. In: 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp. 960\u2013964 (2014). 
IEEE","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"3668_CR40","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"key":"3668_CR41","unstructured":"DeVries, T., Taylor, G.W.: Improved regularization of convolutional neural networks with cutout. arXiv preprint arXiv:1708.04552 (2017)"},{"issue":"8","key":"3668_CR42","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"3668_CR43","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"issue":"6","key":"3668_CR44","doi-asserted-by":"publisher","first-page":"2230","DOI":"10.1093\/comjnl\/bxae002","volume":"67","author":"Y Lei","year":"2024","unstructured":"Lei, Y., Qu, K., Zhao, Y., Han, Q., Wang, X.: Multimodal sentiment analysis based on composite hierarchical fusion. Comput. J. 67(6), 2230\u20132245 (2024). https:\/\/doi.org\/10.1093\/comjnl\/bxae002","journal-title":"Comput. J."},{"key":"3668_CR45","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE international conference on computer vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"issue":"6","key":"3668_CR46","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MIS.2016.94","volume":"31","author":"A Zadeh","year":"2016","unstructured":"Zadeh, A., Zellers, R., Pincus, E., Morency, L.-P.: Multimodal sentiment intensity analysis in videos: Facial gestures and verbal messages. IEEE Intell. Syst. 31(6), 82\u201388 (2016)","journal-title":"IEEE Intell. 
Syst."},{"key":"3668_CR47","unstructured":"Zadeh, A.B., Liang, P.P., Poria, S., Cambria, E., Morency, L.-P.: Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th annual meeting of the association for computational linguistics (Volume 1: Long Papers), pp. 2236\u20132246 (2018)"},{"key":"3668_CR48","doi-asserted-by":"crossref","unstructured":"Wang, Y., Shen, Y., Liu, Z., Liang, P.P., Zadeh, A., Morency, L.-P.: Words can shift: dynamically adjusting word representations using nonverbal behaviors. In: Proceedings of the AAAI conference on artificial intelligence, vol. 33, pp. 7216\u20137223 (2019)","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"3668_CR49","doi-asserted-by":"crossref","unstructured":"Rahman, W., Hasan, M.K., Lee, S., Zadeh, A., Mao, C., Morency, L.-P., Hoque, E.: Integrating multimodal information in large pretrained transformers. In: Proceedings of the conference. Association for computational linguistics. Meeting, vol. 2020, p. 2359 (2020). NIH Public Access","DOI":"10.18653\/v1\/2020.acl-main.214"},{"issue":"3\u20134","key":"3668_CR50","doi-asserted-by":"publisher","first-page":"2090","DOI":"10.1002\/cav.2090","volume":"33","author":"H Wang","year":"2022","unstructured":"Wang, H., Yang, M., Li, Z., Liu, Z., Hu, J., Fu, Z., Liu, F.: SCANET: improving multimodal representation and fusion with sparse-and cross-attention for multimodal sentiment analysis. Comput. Anim. Virtual Worlds 33(3\u20134), 2090 (2022)","journal-title":"Comput. Anim. Virtual Worlds"},{"key":"3668_CR51","doi-asserted-by":"publisher","first-page":"107676","DOI":"10.1016\/j.knosys.2021.107676","volume":"235","author":"T Wu","year":"2022","unstructured":"Wu, T., Peng, J., Zhang, W., Zhang, H., Tan, S., Yi, F., Ma, C., Huang, Y.: Video sentiment analysis with bimodal information-augmented multi-head attention. Knowl.-Based Syst. 
235, 107676 (2022)","journal-title":"Knowl.-Based Syst."},{"key":"3668_CR52","doi-asserted-by":"crossref","unstructured":"Hwang, Y., Kim, J.-H.: Self-supervised unimodal label generation strategy using recalibrated modality representations for multimodal sentiment analysis. In: Findings of the association for computational linguistics: EACL 2023, pp. 35\u201346 (2023)","DOI":"10.18653\/v1\/2023.findings-eacl.2"},{"key":"3668_CR53","doi-asserted-by":"crossref","unstructured":"Li, M., Yang, D., Lei, Y., Wang, S., Wang, S., Su, L., Yang, K., Wang, Y., Sun, M., Zhang, L.: A unified self-distillation framework for multimodal sentiment analysis with uncertain missing modalities. In: Proceedings of the AAAI conference on artificial intelligence, vol. 38, pp. 10074\u201310082 (2024)","DOI":"10.1609\/aaai.v38i9.28871"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03668-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03668-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03668-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T10:01:56Z","timestamp":1745488916000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03668-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,10]]},"references-count":53,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["3668"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03668-w","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"prin
t"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,10]]},"assertion":[{"value":"24 September 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 October 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}