{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T06:43:39Z","timestamp":1768286619595,"version":"3.49.0"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2022,12,21]],"date-time":"2022-12-21T00:00:00Z","timestamp":1671580800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,12,21]],"date-time":"2022-12-21T00:00:00Z","timestamp":1671580800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC0833102"],"award-info":[{"award-number":["2020YFC0833102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC0833102"],"award-info":[{"award-number":["2020YFC0833102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC0833102"],"award-info":[{"award-number":["2020YFC0833102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC0833102"],"award-info":[{"award-number":["2020YFC0833102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC0833102"],"award-info":[{"award-number":["2020YFC0833102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2020YFC0833102"],"award-info":[{"award-number":["2020YFC0833102"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2023,5]]},"DOI":"10.1007\/s11227-022-05001-5","type":"journal-article","created":{"date-parts":[[2022,12,21]],"date-time":"2022-12-21T03:02:30Z","timestamp":1671591750000},"page":"8611-8633","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["DBT: multimodal emotion recognition based on dual-branch 
transformer"],"prefix":"10.1007","volume":"79","author":[{"given":"Yufan","family":"Yi","sequence":"first","affiliation":[]},{"given":"Yan","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Cong","family":"He","sequence":"additional","affiliation":[]},{"given":"Yajing","family":"Fan","sequence":"additional","affiliation":[]},{"given":"Xinli","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Yiping","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,12,21]]},"reference":[{"key":"5001_CR1","unstructured":"Baevski A, Schneider S, Auli M (2019) vq-wav2vec: Self-supervised learning of discrete speech representations. http:\/\/arxiv.org\/abs\/1910.05453"},{"key":"5001_CR2","first-page":"12449","volume":"33","author":"A Baevski","year":"2020","unstructured":"Baevski A, Zhou Y, Mohamed A et al (2020) wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv Neural Inf Process Syst 33:12449\u201312460","journal-title":"Adv Neural Inf Process Syst"},{"issue":"4","key":"5001_CR3","doi-asserted-by":"publisher","first-page":"3795","DOI":"10.1007\/s11227-020-03412-w","volume":"77","author":"V Balakrishnan","year":"2021","unstructured":"Balakrishnan V, Lok PY, Abdul Rahim H (2021) A semi-supervised approach in detecting sentiment and emotion based on digital payment reviews. J Supercomput 77(4):3795\u20133810. https:\/\/doi.org\/10.1007\/s11227-020-03412-w","journal-title":"J Supercomput"},{"issue":"4","key":"5001_CR4","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso C, Bulut M, Lee CC et al (2008) Iemocap: interactive emotional dyadic motion capture database. Language Resour Eval 42(4):335\u2013359","journal-title":"Language Resour Eval"},{"key":"5001_CR5","unstructured":"Chen LW, Rudnicky A (2021) Exploring wav2vec 2.0 fine-tuning for improved speech emotion recognition. http:\/\/arxiv.org\/abs\/2110.06309"},{"key":"5001_CR6","doi-asserted-by":"crossref","unstructured":"Chen M, Zhao X (2020) A multi-scale fusion framework for bimodal speech emotion recognition. In: Interspeech, 374\u2013378","DOI":"10.21437\/Interspeech.2020-3156"},{"key":"5001_CR7","unstructured":"Clark K, Luong MT, Le QV, et\u00a0al (2020) Electra: Pre-training text encoders as discriminators rather than generators. http:\/\/arxiv.org\/abs\/2003.10555"},{"key":"5001_CR8","first-page":"87","volume":"83","author":"J Garofolo","year":"1993","unstructured":"Garofolo J, Graff D, Paul D et al (1993) Csr-i (wsj0) complete ldc93s6a. Web Download Philadelphia: Linguistic Data Consortium 83:87","journal-title":"Web Download Philadelphia: Linguistic Data Consortium"},{"key":"5001_CR9","unstructured":"Garofolo JS (1993) Timit acoustic phonetic continuous speech corpus. Linguistic Data Consortium, 1993"},{"issue":"5","key":"5001_CR10","doi-asserted-by":"publisher","first-page":"6944","DOI":"10.1007\/s11227-021-04124-5","volume":"78","author":"V Gupta","year":"2022","unstructured":"Gupta V, Juyal S, Hu YC (2022) Understanding human emotions through speech spectrograms using deep neural network. J Supercomput 78(5):6944\u20136973. https:\/\/doi.org\/10.1007\/s11227-021-04124-5","journal-title":"J Supercomput"},{"key":"5001_CR11","doi-asserted-by":"crossref","unstructured":"Howard J, Ruder S (2018) Universal language model fine-tuning for text classification. 
http:\/\/arxiv.org\/abs\/1801.06146","DOI":"10.18653\/v1\/P18-1031"},{"key":"5001_CR12","doi-asserted-by":"crossref","unstructured":"Jiang C, Liu J, Mao R et al (2020) Speech emotion recognition based on dcnn bigru self-attention model. 2020 International Conference on Information Science. Parallel and Distributed Systems (ISPDS), IEEE, pp 46\u201351","DOI":"10.1109\/ISPDS51347.2020.00017"},{"issue":"2","key":"5001_CR13","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1016\/S1566-2535(01)00026-4","volume":"2","author":"AL Jousselme","year":"2001","unstructured":"Jousselme AL, Grenier D, Boss\u00e9 \u00c9 (2001) A new distance between two bodies of evidence. Inf Fusion 2(2):91\u2013101","journal-title":"Inf Fusion"},{"key":"5001_CR14","unstructured":"Kenton JDMWC, Toutanova LK (2019) Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of naacL-HLT, pp 4171\u20134186"},{"issue":"5","key":"5001_CR15","doi-asserted-by":"publisher","first-page":"5019","DOI":"10.1007\/s11227-020-03468-8","volume":"77","author":"J Kommineni","year":"2021","unstructured":"Kommineni J, Mandala S, Sunar MS et al (2021) Accurate computing of facial expression recognition using a hybrid feature extraction technique. J Supercomput 77(5):5019\u20135044. https:\/\/doi.org\/10.1007\/s11227-020-03468-8","journal-title":"J Supercomput"},{"key":"5001_CR16","unstructured":"Krishna D, Patil A (2020) Multimodal emotion recognition using cross-modal attention and 1d convolutional neural networks. In: Interspeech, 4243\u20134247"},{"key":"5001_CR17","unstructured":"Lample G, Conneau A (2019) Cross-lingual language model pretraining. http:\/\/arxiv.org\/abs\/1901.07291"},{"key":"5001_CR18","unstructured":"Lan Z, Chen M, Goodman S, et\u00a0al (2019) Albert: A lite bert for self-supervised learning of language representations. http:\/\/arxiv.org\/abs\/1909.11942"},{"key":"5001_CR19","unstructured":"Liu Y, Ott M, Goyal N, et\u00a0al (2019) Roberta: A robustly optimized bert pretraining approach. http:\/\/arxiv.org\/abs\/1907.11692"},{"key":"5001_CR20","doi-asserted-by":"crossref","unstructured":"Macary M, Tahon M, Est\u00e8ve Y, et\u00a0al (2021) On the use of self-supervised pre-trained acoustic and linguistic features for continuous speech emotion recognition. In: 2021 IEEE Spoken Language Technology Workshop (SLT), IEEE, pp 373\u2013380","DOI":"10.1109\/SLT48900.2021.9383456"},{"key":"5001_CR21","doi-asserted-by":"crossref","unstructured":"Makiuchi MR, Uto K, Shinoda K (2021) Multimodal emotion recognition with high-level speech and text features. In: 2021 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU), IEEE, pp 350\u2013357","DOI":"10.1109\/ASRU51503.2021.9688036"},{"key":"5001_CR22","doi-asserted-by":"crossref","unstructured":"Mao S, Tao D, Zhang G et al (2019) Revisiting hidden markov models for speech emotion recognition. ICASSP 2019\u20132019 IEEE International Conference on Acoustics. Speech and Signal Processing (ICASSP), IEEE, pp 6715\u20136719","DOI":"10.1109\/ICASSP.2019.8683172"},{"key":"5001_CR23","volume-title":"Machine learning: a probabilistic perspective","author":"KP Murphy","year":"2012","unstructured":"Murphy KP (2012) Machine learning: a probabilistic perspective. MIT press, Cambridge"},{"key":"5001_CR24","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-022-04416-4","author":"R Nimmagadda","year":"2022","unstructured":"Nimmagadda R, Arora K, Martin MV (2022) Emotion recognition models for companion robots. J Supercomput. 
https:\/\/doi.org\/10.1007\/s11227-022-04416-4","journal-title":"J Supercomput"},{"key":"5001_CR25","doi-asserted-by":"crossref","unstructured":"Park DS, Chan W, Zhang Y, et\u00a0al (2019) Specaugment: a simple data augmentation method for automatic speech recognition. http:\/\/arxiv.org\/abs\/1904.08779","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"5001_CR26","doi-asserted-by":"crossref","unstructured":"Pepino L, Riera P, Ferrer L (2021) Emotion recognition from speech using wav2vec 2.0 embeddings. http:\/\/arxiv.org\/abs\/2104.03502","DOI":"10.21437\/Interspeech.2021-703"},{"key":"5001_CR27","doi-asserted-by":"crossref","unstructured":"Peters ME, Neumann M, Iyyer M, et\u00a0al (2018) Deep contextualized word representations. CoRR http:\/\/arxiv.org\/1802.05365","DOI":"10.18653\/v1\/N18-1202"},{"key":"5001_CR28","unstructured":"Radford A, Narasimhan K, Salimans T, et\u00a0al (2018) Improving language understanding by generative pre-training. OpenAI"},{"key":"5001_CR29","doi-asserted-by":"crossref","unstructured":"Rajamani ST, Rajamani KT, Mallol-Ragolta A et al (2021) A novel attention-based gated recurrent unit and its efficacy in speech emotion recognition. ICASSP 2021\u20132021 IEEE International Conference on Acoustics. Speech and Signal Processing (ICASSP), IEEE, pp 6294\u20136298","DOI":"10.1109\/ICASSP39728.2021.9414489"},{"key":"5001_CR30","doi-asserted-by":"crossref","unstructured":"Santoso J, Yamada T, Makino S, et\u00a0al (2021) Speech emotion recognition based on attention weight correction using word-level confidence measure. In: Interspeech, pp 1947\u20131951","DOI":"10.21437\/Interspeech.2021-411"},{"key":"5001_CR31","doi-asserted-by":"crossref","unstructured":"Sarma M, Ghahremani P, Povey D, et\u00a0al (2018) Emotion identification from raw speech signals using dnns. In: Interspeech, pp 3097\u20133101","DOI":"10.21437\/Interspeech.2018-1353"},{"key":"5001_CR32","doi-asserted-by":"crossref","unstructured":"Satt A, Rozenberg S, Hoory R (2017) Efficient emotion recognition from speech using deep learning on spectrograms. In: Interspeech, pp 1089\u20131093","DOI":"10.21437\/Interspeech.2017-200"},{"key":"5001_CR33","first-page":"330","volume":"1","author":"G Shafer","year":"1992","unstructured":"Shafer G (1992) Dempster-shafer theory. Encycl Artif Intell 1:330\u2013331","journal-title":"Encycl Artif Intell"},{"key":"5001_CR34","doi-asserted-by":"crossref","unstructured":"Siriwardhana S, Reis A, Weerasekera R, et\u00a0al (2020) Jointly fine-tuning\" bert-like\" self supervised models to improve multimodal speech emotion recognition. http:\/\/arxiv.org\/abs\/2008.06682","DOI":"10.21437\/Interspeech.2020-1212"},{"key":"5001_CR35","doi-asserted-by":"crossref","unstructured":"Sun C, Qiu X, Xu Y, et\u00a0al (2019) How to fine-tune bert for text classification? In: China National Conference on Chinese Computational Linguistics, Springer, pp 194\u2013206","DOI":"10.1007\/978-3-030-32381-3_16"},{"key":"5001_CR36","first-page":"4","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N et al (2017) Attention is all you need. Adv Neural Inf Process Syst 30:4","journal-title":"Adv Neural Inf Process Syst"},{"issue":"5","key":"5001_CR37","doi-asserted-by":"publisher","first-page":"6503","DOI":"10.1007\/s11227-021-04097-5","volume":"78","author":"CX Wan","year":"2022","unstructured":"Wan CX, Li B (2022) Financial causal sentence recognition based on bert-cnn text classification. J Supercomput 78(5):6503\u20136527. 
https:\/\/doi.org\/10.1007\/s11227-021-04097-5","journal-title":"J Supercomput"},{"issue":"5","key":"5001_CR38","doi-asserted-by":"publisher","first-page":"3211","DOI":"10.1007\/s11227-018-2554-8","volume":"76","author":"H Wang","year":"2020","unstructured":"Wang H, Wei S, Fang B (2020) Facial expression recognition using iterative fusion of mo-hog and deep features. J Supercomput 76(5):3211\u20133221. https:\/\/doi.org\/10.1007\/s11227-018-2554-8","journal-title":"J Supercomput"},{"key":"5001_CR39","unstructured":"Wang Y, Boumadane A, Heba A (2021) A fine-tuned wav2vec 2.0\/hubert benchmark for speech emotion recognition, speaker verification and spoken language understanding. http:\/\/arxiv.org\/abs\/2111.02735"},{"key":"5001_CR40","first-page":"5","volume":"32","author":"Z Yang","year":"2019","unstructured":"Yang Z, Dai Z, Yang Y et al (2019) Xlnet: Generalized autoregressive pretraining for language understanding. Adv Neural Inf Process Syst 32:5","journal-title":"Adv Neural Inf Process Syst"},{"key":"5001_CR41","doi-asserted-by":"crossref","unstructured":"Yoon S, Byun S, Jung K (2018) Multimodal speech emotion recognition using audio and text. In: 2018 IEEE Spoken Language Technology Workshop (SLT), IEEE, pp 112\u2013118","DOI":"10.1109\/SLT.2018.8639583"},{"key":"5001_CR42","first-page":"8","volume":"27","author":"J Yosinski","year":"2014","unstructured":"Yosinski J, Clune J, Bengio Y et al (2014) How transferable are features in deep neural networks? Adv Neural Inf Process Syst 27:8","journal-title":"Adv Neural Inf Process Syst"},{"key":"5001_CR43","unstructured":"Zadeh A, Zellers R, Pincus E, et\u00a0al (2016) Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. http:\/\/arxiv.org\/abs\/1606.06259"},{"key":"5001_CR44","doi-asserted-by":"crossref","unstructured":"Zadeh AB, Liang PP, Poria S, et\u00a0al (2018) Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp 2236\u20132246","DOI":"10.18653\/v1\/P18-1208"},{"issue":"4","key":"5001_CR45","doi-asserted-by":"publisher","first-page":"4681","DOI":"10.1007\/s11227-021-04058-y","volume":"78","author":"D Zhao","year":"2022","unstructured":"Zhao D, Qian Y, Liu J et al (2022) The facial expression recognition technology under image processing and neural network. J Supercomput 78(4):4681\u20134708. https:\/\/doi.org\/10.1007\/s11227-021-04058-y","journal-title":"J Supercomput"},{"key":"5001_CR46","doi-asserted-by":"crossref","unstructured":"Zheng L, Li Q, Ban H, et\u00a0al (2018) Speech emotion recognition based on convolution neural network combined with random forest. 
In: 2018 Chinese Control and Decision Conference (CCDC), IEEE, pp 4143\u20134147","DOI":"10.1109\/CCDC.2018.8407844"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-05001-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-022-05001-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-022-05001-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,5]],"date-time":"2023-04-05T19:37:47Z","timestamp":1680723467000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-022-05001-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,12,21]]},"references-count":46,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2023,5]]}},"alternative-id":["5001"],"URL":"https:\/\/doi.org\/10.1007\/s11227-022-05001-5","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,12,21]]},"assertion":[{"value":"4 December 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 December 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with ethical standards"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This declaration is not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}
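The record above is the Crossref "works" API response for DOI 10.1007/s11227-022-05001-5 (message-type "work", message-version 1.0.0). As a minimal sketch of how such a record can be retrieved and read, the snippet below queries the public Crossref endpoint https://api.crossref.org/works/{doi} and prints a few of the fields that appear in the record (title, container-title, author, volume/issue/page, references-count). It assumes the Python "requests" library is available; the field names are taken directly from the JSON above, and no authentication is required for this endpoint.

# Sketch: fetch and summarize a Crossref work record (assumes the requests package).
import requests

DOI = "10.1007/s11227-022-05001-5"

# Crossref returns {"status": "ok", "message-type": "work", ..., "message": {...}}
resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]

title = work["title"][0]                      # "DBT: multimodal emotion recognition ..."
journal = work["container-title"][0]          # "The Journal of Supercomputing"
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work.get("author", []))

print(f"{title} ({work['DOI']})")
print(f"{journal}, vol. {work.get('volume')}, issue {work.get('issue')}, pp. {work.get('page')}")
print(f"Authors: {authors}")
print(f"References deposited: {work.get('references-count')}")

Run as-is, this should report the same metadata shown in the record: volume 79, issue 8, pages 8611-8633, six authors, and 46 deposited references.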