{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T21:16:59Z","timestamp":1776374219748,"version":"3.51.2"},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"19","license":[{"start":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T00:00:00Z","timestamp":1726790400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T00:00:00Z","timestamp":1726790400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"the Fundamental Research Funds for the Central Universities","award":["N2224005-1"],"award-info":[{"award-number":["N2224005-1"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Soft Comput"],"published-print":{"date-parts":[[2024,10]]},"DOI":"10.1007\/s00500-024-09886-7","type":"journal-article","created":{"date-parts":[[2024,9,20]],"date-time":"2024-09-20T09:02:03Z","timestamp":1726822923000},"page":"10897-10910","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["MATF: main-auxiliary transformer fusion for multi-modal sentiment analysis"],"prefix":"10.1007","volume":"28","author":[{"given":"Qing","family":"Zhong","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinhui","family":"Shao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,20]]},"reference":[{"key":"9886_CR1","doi-asserted-by":"crossref","unstructured":"Arora S, May A, Zhang J et al. (2020) Contextual embeddings: when are they worth it?. Annual meeting of the association for computational linguistics","DOI":"10.18653\/v1\/2020.acl-main.236"},{"key":"9886_CR2","doi-asserted-by":"crossref","unstructured":"Cai GY, Xia BB (2015) Convolutional neural networks for multimedia sentiment analysis. In: Proceedings of natural language processing and chinese computing, pp 159--167","DOI":"10.1007\/978-3-319-25207-0_14"},{"key":"9886_CR3","doi-asserted-by":"crossref","unstructured":"Degottex G, Kane J, Drugman T et al. (2014) COVAREP \u2014 A collaborative voice analysis repository for speech technologies. In: 2014 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 960\u2013964","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"9886_CR4","unstructured":"Delbrouck JB, Tits N, Brousmiche M et al. A Transformer-based joint-encoding for emotion recognition and sentiment analysis. https:\/\/arxiv.org\/abs\/2006.15955"},{"key":"9886_CR5","unstructured":"Devlin J, Chang MW, Lee K et al. (2019) BERT: pre-training of deep bidirectional transformers for language understanding, https:\/\/arxiv.org\/abs\/1810.04805"},{"key":"9886_CR6","doi-asserted-by":"crossref","unstructured":"Dobri\u0161ek S, Gajsek R, Mihelic F et al. (2013) Towards efficient multi-modal emotion recognition. Int J Adv Robot Syst. 10","DOI":"10.5772\/54002"},{"key":"9886_CR7","doi-asserted-by":"publisher","first-page":"994","DOI":"10.1109\/TASL.2011.2170835","volume":"20","author":"T Drugman","year":"2012","unstructured":"Drugman T, Thomas MRP, Gu\u00f0nason J et al (2012) Detection of glottal closure instants from speech signals: a quantitative review[J]. IEEE Trans Audio Speech Lang Process 20:994\u20131006","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"9886_CR8","unstructured":"Drugman T, Alwan A (2019) Joint robust voicing detection and pitch estimation based on residual harmonics, Interspeech"},{"key":"9886_CR9","doi-asserted-by":"crossref","unstructured":"Han W, Chen H, Poria S (2021) Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. In: Conference on empirical methods in natural language processing","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"9886_CR10","unstructured":"Hazarika D, Zimmermann R, Poria S (2020) MISA: modality-invariant and -specific representations for multimodal sentiment analysis, Proceedings of the 28th ACM international conference on multimedia"},{"key":"9886_CR11","doi-asserted-by":"publisher","first-page":"1170","DOI":"10.1109\/TASL.2013.2245653","volume":"21","author":"J Kane","year":"2013","unstructured":"Kane J, Gobl C (2013) Wavelet maxima dispersion for breathy to tense voice discrimination[J]. IEEE Trans Audio Speech Lang Process 21:1170\u20131179","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"9886_CR12","doi-asserted-by":"crossref","unstructured":"Liu Z, Shen Y, Lakshminarasimhan VB et al. (2018) efficient low-rank multimodal fusion with modality-specific factors efficient low-rank multimodal fusion with modality-specific factors, https:\/\/arxiv.org\/abs\/1806.00064","DOI":"10.18653\/v1\/P18-1209"},{"key":"9886_CR13","unstructured":"Liu Y, Ott M, Goyal N et al. (2019) RoBERTa: a robustly optimized BERT pretraining approach, https:\/\/arxiv.org\/abs\/1907.11692"},{"key":"9886_CR14","unstructured":"Loshchilov I, Hutter F (2017) Fixing weight decay regularization in Adam, https:\/\/arxiv.org\/abs\/1711.05101"},{"key":"9886_CR15","doi-asserted-by":"crossref","unstructured":"Mai SJ, Hu HF, Xing SL (2019) Divide, conquer and combine: hierarchical feature fusion network with local and global perspectives for multimodal affective computing, In: Proceedings of annual meeting of the association for computational linguistics, pp 481\u2013492","DOI":"10.18653\/v1\/P19-1046"},{"key":"9886_CR16","unstructured":"Martins A, Astudillo R (2016) From softmax to sparsemax: a sparse model of attention and multi-label classification. In: International conference on machine learning, pp.1614\u20131623"},{"key":"9886_CR17","doi-asserted-by":"crossref","unstructured":"Monkaresi H, Hussain MS, Calvo RA (2012) Classification of affects using head movement, skin color features and physiological signals. In: Proceedings of 2012 IEEE international conference on systems, man, and cybernetics (SMC), pp 2664\u20132669","DOI":"10.1109\/ICSMC.2012.6378149"},{"key":"9886_CR18","unstructured":"Niculae V, Blondel M (2017) A regularized framework for sparse and structured neural attention, Adv Neural Inform Process Syst, 30"},{"key":"9886_CR19","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) GloVe: global vectors for word representation, In: Conference on empirical methods in natural language processing","DOI":"10.3115\/v1\/D14-1162"},{"key":"9886_CR20","unstructured":"Pham H, Liang PP, Manzini T et al. (2018) Found in translation: learning robust joint representations by cyclic translations between modalities, https:\/\/arxiv.org\/abs\/1812.07809"},{"key":"9886_CR21","doi-asserted-by":"crossref","unstructured":"Poria S, Cambria E, Hazarika D et al. (2017) Multi-level multiple attentions for contextual multimodal sentiment analysis. In: 2017 IEEE international conference on data mining (ICDM). 1033\u20131038","DOI":"10.1109\/ICDM.2017.134"},{"key":"9886_CR22","doi-asserted-by":"publisher","first-page":"28750","DOI":"10.1109\/ACCESS.2022.3157712","volume":"10","author":"Q Qi","year":"2022","unstructured":"Qi Q, Lin L, Zhang R et al (2022) MEDT: using multimodal encoding-decoding network as in transformer for multimodal sentiment analysis. IEEE Access 10:28750\u201328759","journal-title":"IEEE Access"},{"key":"9886_CR23","doi-asserted-by":"publisher","first-page":"4767437","DOI":"10.1155\/2022\/4767437","volume":"2022","author":"Z Quan","year":"2022","unstructured":"Quan Z, Sun T, Su M et al (2022) Multimodal sentiment analysis based on cross-modal attention and gated cyclic hierarchical fusion networks[J]. Comput Intell Neurosci 2022:4767437","journal-title":"Comput Intell Neurosci"},{"key":"9886_CR24","doi-asserted-by":"crossref","unstructured":"Rahman W, Hasan M, Lee S et al. (2020) Integrating multimodal information in large pretrained transformers, Proceedings of the conference. Association for computational linguistics. Meeting. 2020: 2359\u20132369","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"9886_CR25","doi-asserted-by":"crossref","unstructured":"Tsai YHH, Bai S, Liang PP et al. (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference. Association for Computational Linguistics. Meeting, pp 6558\u20136569","DOI":"10.18653\/v1\/P19-1656"},{"key":"9886_CR26","doi-asserted-by":"crossref","unstructured":"Wang Y, Shen Y, Liu Z et al. (2018) Words can shift: dynamically adjusting word representations using nonverbal behaviors. In: Proceedings of the AAAI conference on artificial intelligence. AAAI conference on artificial intelligence. 331: 7216\u20137223","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"9886_CR27","unstructured":"Wang YK, Huang WB, Sun FC et al. (2020) Deep multimodal fusion by channel exchanging. https:\/\/arxiv.org\/abs\/2011.05005"},{"key":"9886_CR28","unstructured":"Yang K, Xu H, Gao K (2022) CM-BERT: cross-modal BERT for text-audio sentiment analysis. In: Proceedings of the 28th ACM international conference on multimedia"},{"key":"9886_CR29","unstructured":"Yu WM, Xu H, Yuan Z et al. (2021) Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis, https:\/\/arxiv.org\/abs\/2102.04830"},{"key":"9886_CR30","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1109\/MIS.2016.94","volume":"31","author":"A Zadeh","year":"2016","unstructured":"Zadeh A, Zellers R, Pincus E et al (2016) Multimodal sentiment intensity analysis in videos: facial gestures and verbal messages. IEEE Intell Syst 31:82\u201388","journal-title":"IEEE Intell Syst"},{"key":"9886_CR31","doi-asserted-by":"crossref","unstructured":"Zadeh A, Chen MH, Poria S et al. (2017) Tensor fusion network for multimodal sentiment analysis, https:\/\/arxiv.org\/abs\/1707.07250","DOI":"10.18653\/v1\/D17-1115"},{"key":"9886_CR32","doi-asserted-by":"crossref","unstructured":"Zadeh A, Liang PP, Poria S et al. (2018) Multi-attention recurrent network for human communication comprehension. In: Proceedings of the AAAI conference on artificial intelligence. AAAI conference on artificial intelligence, pp 5642\u20135649","DOI":"10.1609\/aaai.v32i1.12024"},{"key":"9886_CR33","unstructured":"Zadeh A, Liang PP, Poria S, et al. (2018) Multimodal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph. In: The Proceedings of Annual Meeting of the Association for Computational Linguistics"},{"key":"9886_CR34","unstructured":"Zadeh A, Mao C, Shi K (2019) Factorized multimodal transformer for multimodal sequential learning. https:\/\/arxiv.org\/abs\/1911.09826"},{"key":"9886_CR35","doi-asserted-by":"publisher","first-page":"5586","DOI":"10.1109\/TKDE.2021.3070203","volume":"34","author":"Y Zhang","year":"2017","unstructured":"Zhang Y, Yang Q (2017) A survey on multi-task learning. IEEE Trans Knowl Data Eng 34:5586\u20135609","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"9886_CR36","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-03343-4","author":"Q Zhang","year":"2022","unstructured":"Zhang Q, Shi L, Liu P et al (2022) ICDN: integrating consistency and difference networks by transformer for multimodal sentiment analysis. Appl Intell. https:\/\/doi.org\/10.1007\/s10489-022-03343-4","journal-title":"Appl Intell"},{"issue":"6","key":"9886_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3596219","volume":"22","author":"S Zhao","year":"2023","unstructured":"Zhao S, Li Q, Yang Y et al (2023) From softmax to nucleusmax: a novel sparse language model for chinese radiology report summarization. ACM Trans Asian Low-Resour Lang Inform Process. 22(6):1\u201321","journal-title":"ACM Trans Asian Low-Resour Lang Inform Process."},{"key":"9886_CR38","doi-asserted-by":"crossref","unstructured":"Zhuang XQ, Liu FA, Hou J et al. (2022) Transformer-based interactive multi-modal attention network for video sentiment detection, Neural Process Lett, 1\u201318","DOI":"10.1007\/s11063-021-10713-5"}],"container-title":["Soft Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00500-024-09886-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00500-024-09886-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00500-024-09886-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,19]],"date-time":"2024-10-19T07:16:59Z","timestamp":1729322219000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00500-024-09886-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,20]]},"references-count":38,"journal-issue":{"issue":"19","published-print":{"date-parts":[[2024,10]]}},"alternative-id":["9886"],"URL":"https:\/\/doi.org\/10.1007\/s00500-024-09886-7","relation":{},"ISSN":["1432-7643","1433-7479"],"issn-type":[{"value":"1432-7643","type":"print"},{"value":"1433-7479","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,20]]},"assertion":[{"value":"7 May 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 September 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have not disclosed any competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}