{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,8]],"date-time":"2026-02-08T11:18:49Z","timestamp":1770549529420,"version":"3.49.0"},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2024,4,12]],"date-time":"2024-04-12T00:00:00Z","timestamp":1712880000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,4,12]],"date-time":"2024-04-12T00:00:00Z","timestamp":1712880000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72132008"],"award-info":[{"award-number":["72132008"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-19152-5","type":"journal-article","created":{"date-parts":[[2024,4,12]],"date-time":"2024-04-12T07:03:21Z","timestamp":1712905401000},"page":"1385-1402","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Transformer-based adaptive contrastive learning for multimodal sentiment analysis"],"prefix":"10.1007","volume":"84","author":[{"given":"Yifan","family":"Hu","sequence":"first","affiliation":[]},{"given":"Xi","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2250-3098","authenticated-orcid":false,"given":"Xianbing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Hai","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Rong","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,4,12]]},"reference":[{"key":"19152_CR1","unstructured":"Ngiam J, Khosla A, Kim M et\u00a0al (2011) Multimodal deep learning. In: Proceedings of the 28th international conference on machine learning (ICML-11), pp 689\u2013696"},{"key":"19152_CR2","doi-asserted-by":"crossref","unstructured":"Yu W, Xu H, Yuan Z et\u00a0al (2021) Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. In: Proceedings of the AAAI conference on artificial intelligence, pp 10790\u201310797","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"19152_CR3","doi-asserted-by":"crossref","unstructured":"Hazarika D, Zimmermann R, Poria S (2020) Misa: Modality-invariant and-specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM international conference on multimedia, pp 1122\u20131131","DOI":"10.1145\/3394171.3413678"},{"key":"19152_CR4","unstructured":"Devlin J, Chang MW, Lee K et\u00a0al (2018) Bert: Pre-training of deep bidirectional transformers for language understanding. Preprint arXiv:1810.04805"},{"key":"19152_CR5","doi-asserted-by":"crossref","unstructured":"Yu W, Xu H, Meng F et\u00a0al (2020) Ch-sims: A chinese multimodal sentiment analysis dataset with fine-grained annotation of modality. 
In: Proceedings of the 58th annual meeting of the association for computational linguistics, pp 3718\u20133727","DOI":"10.18653\/v1\/2020.acl-main.343"},{"key":"19152_CR6","doi-asserted-by":"publisher","first-page":"184","DOI":"10.1016\/j.inffus.2020.09.005","volume":"66","author":"D Gkoumas","year":"2021","unstructured":"Gkoumas D, Li Q, Lioma C et al (2021) What makes the difference? an empirical comparison of fusion strategies for multimodal language analysis. Inf Fusion 66:184\u2013197","journal-title":"Inf Fusion"},{"key":"19152_CR7","doi-asserted-by":"crossref","unstructured":"Liu Y, Yuan Z, Mao H et\u00a0al (2022) Make acoustic and visual cues matter: Ch-sims v2. 0 dataset and av-mixup consistent module. In: Proceedings of the 2022 international conference on multimodal interaction, pp 247\u2013258","DOI":"10.1145\/3536221.3556630"},{"key":"19152_CR8","doi-asserted-by":"crossref","unstructured":"Tsai YHH, Bai S, Liang PP, et\u00a0al (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference. Association for Computational Linguistics. Meeting, NIH Public Access, p 6558","DOI":"10.18653\/v1\/P19-1656"},{"key":"19152_CR9","doi-asserted-by":"publisher","first-page":"107676","DOI":"10.1016\/j.knosys.2021.107676","volume":"235","author":"T Wu","year":"2022","unstructured":"Wu T, Peng J, Zhang W et al (2022) Video sentiment analysis with bimodal information-augmented multi-head attention. Knowl-Based Syst 235:107676","journal-title":"Knowl-Based Syst"},{"key":"19152_CR10","doi-asserted-by":"crossref","unstructured":"Han W, Chen H, Poria S (2021) Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. Preprint arXiv:2109.00412","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"issue":"2","key":"19152_CR11","doi-asserted-by":"publisher","first-page":"736","DOI":"10.1109\/TNNLS.2020.2979225","volume":"32","author":"D Wang","year":"2020","unstructured":"Wang D, Jing B, Lu C et al (2020) Coarse alignment of topic and sentiment: A unified model for cross-lingual sentiment classification. IEEE Trans Neural Netw Learn Syst 32(2):736\u2013747","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"19152_CR12","unstructured":"Gutmann M, Hyv\u00e4rinen A (2010) Noise-contrastive estimation: A new estimation principle for unnormalized statistical models. In: Proceedings of the thirteenth international conference on artificial intelligence and statistics, JMLR workshop and conference proceedings, pp 297\u2013304"},{"key":"19152_CR13","first-page":"18661","volume":"33","author":"P Khosla","year":"2020","unstructured":"Khosla P, Teterwak P, Wang C et al (2020) Supervised contrastive learning. Adv Neural Inf Process Syst 33:18661\u201318673","journal-title":"Adv Neural Inf Process Syst"},{"key":"19152_CR14","doi-asserted-by":"crossref","unstructured":"Gao T, Yao X, Chen D (2021) Simcse: Simple contrastive learning of sentence embeddings. Preprint arXiv:2104.08821","DOI":"10.18653\/v1\/2021.emnlp-main.552"},{"key":"19152_CR15","doi-asserted-by":"crossref","unstructured":"Hu G, Lin TE, Zhao Y et\u00a0al (2022) Unimse: Towards unified multimodal sentiment analysis and emotion recognition. Preprint arXiv:2211.11256","DOI":"10.18653\/v1\/2022.emnlp-main.534"},{"key":"19152_CR16","doi-asserted-by":"crossref","unstructured":"Li Z, Xu B, Zhu C et\u00a0al (2022) Clmlf: a contrastive learning and multi-layer fusion method for multimodal sentiment detection. 
Preprint arXiv:2204.05515","DOI":"10.18653\/v1\/2022.findings-naacl.175"},{"key":"19152_CR17","doi-asserted-by":"crossref","unstructured":"Morency LP, Mihalcea R, Doshi P (2011) Towards multimodal sentiment analysis: Harvesting opinions from the web. In: Proceedings of the 13th international conference on multimodal interfaces, pp 169\u2013176","DOI":"10.1145\/2070481.2070509"},{"key":"19152_CR18","doi-asserted-by":"crossref","unstructured":"Zadeh A, Chen M, Poria S et\u00a0al (2017) Tensor fusion network for multimodal sentiment analysis. Preprint arXiv:1707.07250","DOI":"10.18653\/v1\/D17-1115"},{"key":"19152_CR19","doi-asserted-by":"crossref","unstructured":"Liu Z, Shen Y, Lakshminarasimhan VB et\u00a0al (2018) Efficient low-rank multimodal fusion with modality-specific factors. Preprint arXiv:1806.00064","DOI":"10.18653\/v1\/P18-1209"},{"key":"19152_CR20","unstructured":"Vaswani A, Shazeer N, Parmar N et\u00a0al (2017) Attention is all you need. Advances in neural information processing systems 30"},{"key":"19152_CR21","doi-asserted-by":"crossref","unstructured":"Sun C, Myers A, Vondrick C et\u00a0al (2019) Videobert: A joint model for video and language representation learning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7464\u20137473","DOI":"10.1109\/ICCV.2019.00756"},{"key":"19152_CR22","doi-asserted-by":"crossref","unstructured":"Rahman W, Hasan MK, Lee S et\u00a0al (2020) Integrating multimodal information in large pretrained transformers. In: Proceedings of the conference. Association for Computational Linguistics. Meeting, NIH Public Access, p 2359","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"19152_CR23","doi-asserted-by":"publisher","first-page":"985","DOI":"10.1109\/TASLP.2021.3049898","volume":"29","author":"Z Lian","year":"2021","unstructured":"Lian Z, Liu B, Tao J (2021) Ctnet: Conversational transformer network for emotion recognition. IEEE\/ACM Trans Audio Speech Lang Process 29:985\u20131000","journal-title":"IEEE\/ACM Trans Audio Speech Lang Process"},{"key":"19152_CR24","unstructured":"Wang W, Han C, Zhou T, et\u00a0al (2022) Visual recognition with deep nearest centroids. Preprint arXiv:2209.07383"},{"key":"19152_CR25","doi-asserted-by":"crossref","unstructured":"Han W, Chen H, Gelbukh A et\u00a0al (2021) Bi-bimodal modality fusion for correlation-controlled multimodal sentiment analysis. In: Proceedings of the 2021 international conference on multimodal interaction, pp 6\u201315","DOI":"10.1145\/3462244.3479919"},{"key":"19152_CR26","doi-asserted-by":"crossref","unstructured":"Yuan Z, Li W, Xu H et\u00a0al (2021) Transformer-based feature reconstruction network for robust multimodal sentiment analysis. In: Proceedings of the 29th ACM international conference on multimedia, pp 4400\u20134407","DOI":"10.1145\/3474085.3475585"},{"issue":"10","key":"19152_CR27","doi-asserted-by":"publisher","first-page":"6642","DOI":"10.1109\/TCSVT.2022.3177320","volume":"32","author":"L Yan","year":"2022","unstructured":"Yan L, Ma S, Wang Q et al (2022) Video captioning using global-local representation. IEEE Trans Circuits Syst Video Technol 32(10):6642\u20136656","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"19152_CR28","doi-asserted-by":"publisher","first-page":"109259","DOI":"10.1016\/j.patcog.2022.109259","volume":"136","author":"D Wang","year":"2023","unstructured":"Wang D, Guo X, Tian Y et al (2023) Tetfn: A text enhanced transformer fusion network for multimodal sentiment analysis. 
Pattern Recognit 136:109259","journal-title":"Pattern Recognit"},{"key":"19152_CR29","doi-asserted-by":"crossref","unstructured":"Wu Y, Zhao Y, Yang H et\u00a0al (2022) Sentiment word aware multimodal refinement for multimodal sentiment analysis with asr errors. Preprint arXiv:2203.00257","DOI":"10.18653\/v1\/2022.findings-acl.109"},{"key":"19152_CR30","doi-asserted-by":"crossref","unstructured":"Zadeh AB, Liang PP, Poria S et\u00a0al (2018) Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th annual meeting of the association for computational linguistics (vol 1: Long Papers), pp 2236\u20132246","DOI":"10.18653\/v1\/P18-1208"},{"key":"19152_CR31","doi-asserted-by":"crossref","unstructured":"Hao X, Zhu Y, Appalaraju S et\u00a0al (2023) Mixgen: A new multi-modal data augmentation. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 379\u2013389","DOI":"10.1109\/WACVW58289.2023.00042"},{"key":"19152_CR32","doi-asserted-by":"publisher","unstructured":"Zhang Y, Yang Q (2022) A survey on multi-task learning. IEEE Trans Knowl Data Eng 5586\u20135609. https:\/\/doi.org\/10.1109\/tkde.2021.3070203,","DOI":"10.1109\/tkde.2021.3070203"},{"key":"19152_CR33","doi-asserted-by":"crossref","unstructured":"Kendall A, Gal Y, Cipolla R (2018) Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7482\u20137491","DOI":"10.1109\/CVPR.2018.00781"},{"key":"19152_CR34","unstructured":"Liebel L, K\u00f6rner M (2018) Auxiliary tasks in multi-task learning. Preprint arXiv:1805.06334"},{"key":"19152_CR35","unstructured":"Tsai YHH, Liang PP, Zadeh A et\u00a0al (2018) Learning factorized multimodal representations. Preprint arXiv:1806.06176"},{"key":"19152_CR36","unstructured":"Liu Y, Ott M, Goyal N et\u00a0al (2019) Roberta: A robustly optimized bert pretraining approach. Preprint arXiv:1907.11692"},{"key":"19152_CR37","doi-asserted-by":"crossref","unstructured":"Zadeh A, Liang PP, Mazumder N et\u00a0al (2018) Memory fusion network for multi-view sequential learning. 
In: Proceedings of the AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v32i1.12021"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-19152-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-19152-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-19152-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,28]],"date-time":"2025-01-28T13:08:21Z","timestamp":1738069701000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-19152-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,12]]},"references-count":37,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2025,1]]}},"alternative-id":["19152"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-19152-5","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,4,12]]},"assertion":[{"value":"21 January 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 March 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 April 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 April 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare they have no conflict of interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}