{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T07:03:55Z","timestamp":1779865435586,"version":"3.53.1"},"reference-count":38,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Biomedical Signal Processing and Control"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.bspc.2026.110504","type":"journal-article","created":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T14:07:58Z","timestamp":1778249278000},"page":"110504","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Saliency-driven neural network for generalizable speech emotion recognition"],"prefix":"10.1016","volume":"123","author":[{"given":"Wenkui","family":"Zheng","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gaigai","family":"Tang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhu","family":"Yuan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9672-1539","authenticated-orcid":false,"given":"Huiyun","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Donghan","family":"Hou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Puyang","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Heming","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.bspc.2026.110504_b0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110794","article-title":"A survey of dialogic emotion analysis: developments, approaches and perspectives","volume":"156","author":"Gan","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.bspc.2026.110504_b0010","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110261","article-title":"EmoComicNet: a multi-task model for comic emotion recognition","volume":"150","author":"Dutta","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.bspc.2026.110504_b0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.jnca.2019.102423","article-title":"A survey of emotion recognition methods with emphasis on E-learning environments","volume":"147","author":"Imani","year":"2019","journal-title":"J. Netw. Comput. Appl."},{"key":"10.1016\/j.bspc.2026.110504_b0020","doi-asserted-by":"crossref","first-page":"2043","DOI":"10.1109\/TASLP.2023.3277291","article-title":"Wavelet multiresolution analysis-based speech emotion recognition system using 1D CNN LSTM networks","volume":"31","author":"Dutt","year":"2023","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"5","key":"10.1016\/j.bspc.2026.110504_b0025","doi-asserted-by":"crossref","first-page":"1459","DOI":"10.1016\/j.jnca.2010.08.007","article-title":"Robust several-speaker speech recognition with highly dependable online speaker adaptation and identification","volume":"34","author":"Shih","year":"2011","journal-title":"J. Netw. Comput. Appl."},{"issue":"2","key":"10.1016\/j.bspc.2026.110504_b0030","doi-asserted-by":"crossref","first-page":"177","DOI":"10.1109\/81.904882","article-title":"A comparison of waveform fractal dimension algorithms","volume":"48","author":"Esteller","year":"2001","journal-title":"IEEE Trans. Circuits Syst. I: Fundam. Theory Appl."},{"key":"10.1016\/j.bspc.2026.110504_b0035","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110200","article-title":"IIOF: intra- and Inter-feature orthogonal fusion of local and global features for music emotion recognition","volume":"148","author":"Chang","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.bspc.2026.110504_b0040","doi-asserted-by":"crossref","first-page":"176958","DOI":"10.1109\/ACCESS.2025.3616056","article-title":"Encouraging discriminative attention through contrastive explainability learning for lung cancer diagnosis","volume":"13","author":"Shravya","year":"2025","journal-title":"IEEE Access"},{"key":"10.1016\/j.bspc.2026.110504_b0045","series-title":"Proc. ICASSP, Barcelona, Spain","first-page":"7179","article-title":"Multi-head attention for speech emotion recognition with auxiliary learning of gender recognition","author":"Nediyanchath","year":"2020"},{"key":"10.1016\/j.bspc.2026.110504_b0050","doi-asserted-by":"crossref","DOI":"10.1016\/j.bspc.2024.107268","article-title":"Lung image quality assessment and diagnosis using generative autoencoders in unsupervised ensemble learning","volume":"102","author":"Rajasekar","year":"2025","journal-title":"Biomed. Signal Process. Control"},{"key":"10.1016\/j.bspc.2026.110504_b0055","series-title":"Proc ICASSP, Seoul, Korea","first-page":"12271","article-title":"MS-SENet: Enhancing speech emotion recognition through multi-scale feature fusion with squeeze-and-excitation blocks","author":"Li","year":"2024"},{"issue":"2","key":"10.1016\/j.bspc.2026.110504_b0060","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1109\/TSA.2004.838534","article-title":"Toward detecting emotions in spoken dialogs","volume":"13","author":"Lee","year":"2005","journal-title":"IEEE Trans. Speech Audio Process."},{"key":"10.1016\/j.bspc.2026.110504_b0065","doi-asserted-by":"crossref","first-page":"668","DOI":"10.1016\/j.patcog.2018.12.016","article-title":"Attention-based convolutional neural network and long short-term memory for short-term detection of mood disorders based on elicited speech responses","volume":"88","author":"Huang","year":"2019","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.bspc.2026.110504_b0070","doi-asserted-by":"crossref","first-page":"2534","DOI":"10.1109\/TASLP.2023.3289312","article-title":"Music theory-inspired acoustic representation for speech emotion recognition","volume":"31","author":"Li","year":"2023","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"2","key":"10.1016\/j.bspc.2026.110504_b0075","doi-asserted-by":"crossref","first-page":"1098","DOI":"10.1109\/TAFFC.2021.3111110","article-title":"Enforcing semantic consistency for cross-corpus emotion prediction using adversarial discrepancy learning in emotion","volume":"14","author":"Chang","year":"2023","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"3","key":"10.1016\/j.bspc.2026.110504_b0080","doi-asserted-by":"crossref","first-page":"393","DOI":"10.1109\/TAFFC.2018.2803178","article-title":"Detecting unipolar and bipolar depressive disorders from elicited speech responses using latent affective structure model","volume":"11","author":"Huang","year":"2020","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"3","key":"10.1016\/j.bspc.2026.110504_b0085","doi-asserted-by":"crossref","first-page":"405","DOI":"10.1109\/TAFFC.2018.2805892","article-title":"Low-level characterization of expressive head motion through frequency domain analysis","volume":"11","author":"Ding","year":"2020","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"2","key":"10.1016\/j.bspc.2026.110504_b0090","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/JSTSP.2019.2955012","article-title":"Automatic assessment of depression from speech via a hierarchical attention transfer network and attention autoencoders","volume":"14","author":"Zhao","year":"2020","journal-title":"IEEE J. Sel. Top. Signal Process."},{"key":"10.1016\/j.bspc.2026.110504_b0095","doi-asserted-by":"crossref","first-page":"2617","DOI":"10.1109\/TASLP.2021.3096037","article-title":"Information fusion in attention networks using adaptive and multi-level factorized bilinear pooling for audio-visual emotion recognition","volume":"29","author":"Zhou","year":"2021","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"issue":"9","key":"10.1016\/j.bspc.2026.110504_b0100","doi-asserted-by":"crossref","first-page":"10745","DOI":"10.1109\/TPAMI.2023.3263585","article-title":"Dawn of the Transformer era in speech emotion recognition: closing the valence gap","volume":"45","author":"Triantafyllopoulos","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.bspc.2026.110504_b0105","series-title":"Proc. ICASSP","first-page":"6897","article-title":"Key-sparse Transformer for multimodal speech emotion recognition","author":"Chen","year":"2022"},{"key":"10.1016\/j.bspc.2026.110504_b0110","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2021.107868","article-title":"Multi-task learning for gait-based identity recognition and emotion recognition using attention enhanced temporal graph convolutional network","volume":"114","author":"Sheng","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.bspc.2026.110504_b0115","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110117","article-title":"MSA-GCN: multiscale adaptive graph convolution network for gait emotion recognition","volume":"147","author":"Yin","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.bspc.2026.110504_b0120","first-page":"15288","article-title":"Calibrating deep neural networks using focal loss","volume":"33","author":"Mukhoti","year":"2020","journal-title":"Proc. NIPS"},{"key":"10.1016\/j.bspc.2026.110504_b0125","series-title":"Proc. ICASSP, Toronto, ON, Canada","first-page":"3790","article-title":"Cross-corpus speech emotion recognition using joint distribution adaptive regression","author":"Zhang","year":"2021"},{"issue":"2","key":"10.1016\/j.bspc.2026.110504_b0130","doi-asserted-by":"crossref","first-page":"196","DOI":"10.1109\/TAFFC.2017.2702653","article-title":"ISLA: Temporal segmentation and labeling for audio-visual emotion recognition","volume":"10","author":"Kim","year":"2019","journal-title":"IEEE Trans. Affect. Comput."},{"key":"10.1016\/j.bspc.2026.110504_b0135","doi-asserted-by":"crossref","first-page":"2193","DOI":"10.1109\/TASLP.2023.3282092","article-title":"Dual-TBNet: improving the robustness of speech features via dual-transformer-BiLSTM for speech emotion recognition","volume":"31","author":"Liu","year":"2023","journal-title":"IEEE\/ACM Trans. Audio Speech Lang. Process."},{"key":"10.1016\/j.bspc.2026.110504_b0140","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112499","article-title":"CENN: Capsule-enhanced neural network with innovative metrics for robust speech emotion recognition","volume":"304","author":"Zhang","year":"2024","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.bspc.2026.110504_b0145","doi-asserted-by":"crossref","first-page":"238","DOI":"10.1016\/j.neucom.2021.02.094","article-title":"Spatiotemporal and requential cascaded attention networks for speech emotion recognition","volume":"448","author":"Li","year":"2021","journal-title":"Neurocomputing"},{"issue":"43","key":"10.1016\/j.bspc.2026.110504_b0150","doi-asserted-by":"crossref","first-page":"32917","DOI":"10.1007\/s11042-020-09693-w","article-title":"Speech emotion recognition using convolutional neural network and long-short termmemory","volume":"79","author":"Dangol","year":"2020","journal-title":"Multimed. Tools Appl."},{"issue":"10","key":"10.1016\/j.bspc.2026.110504_b0155","doi-asserted-by":"crossref","first-page":"1440","DOI":"10.1109\/LSP.2018.2860246","article-title":"3-D convolutional recurrent neural networks with attention model for speech emotion recognition","volume":"25","author":"Chen","year":"2018","journal-title":"IEEE Signal Process Lett."},{"issue":"8","key":"10.1016\/j.bspc.2026.110504_b0160","doi-asserted-by":"crossref","first-page":"2297","DOI":"10.3390\/s20082297","article-title":"Speech emotion recognition based on selective interpolation synthetic minority oversampling technique in small sample environment","volume":"20","author":"Liu","year":"2020","journal-title":"Sensors"},{"key":"10.1016\/j.bspc.2026.110504_b0165","series-title":"Proc. EUSIPCO, Amsterdam, Netherlands","first-page":"1","article-title":"An end-to-end multitask learning model to improve speech emotion recognition","author":"Fu","year":"2021"},{"key":"10.1016\/j.bspc.2026.110504_b0170","series-title":"Proc. INTERSPEECH","article-title":"A database of German emotional speech","author":"Burkhardt","year":"2005"},{"key":"10.1016\/j.bspc.2026.110504_b0175","doi-asserted-by":"crossref","unstructured":"R. Barkur, D. Deepanshi, K. Suresh et al., \u201cEnsembleWave: An ensembled approach for automatic speech emotion recognition,\u201d in Proc. CONECCT, Bangalore, India, 2022, pp. 1-6.","DOI":"10.1109\/CONECCT55679.2022.9865696"},{"key":"10.1016\/j.bspc.2026.110504_b0180","first-page":"1","article-title":"Decision tree SVM model with Fisher feature selection for speech emotion recognition","volume":"1","author":"Sun","year":"2019","journal-title":"EURASIP J. Audio Speech Music Process."},{"key":"10.1016\/j.bspc.2026.110504_b0185","doi-asserted-by":"crossref","first-page":"150","DOI":"10.1016\/j.ins.2019.09.005","article-title":"Two-layer fuzzy multiple random forest for speech emotion recognition in human-robot interaction","volume":"509","author":"Chen","year":"2020","journal-title":"Inf. Sci."},{"key":"10.1016\/j.bspc.2026.110504_b0190","doi-asserted-by":"crossref","unstructured":"S. Cheng, D. Zhang, and D. Yin, \u201cA DenseNet-GRU technology for chinese speech emotion recognition,\u201d in Proc. ICFEICT, New York, NY, United States, 2021, pp. 1-7.","DOI":"10.1145\/3474198.3478152"}],"container-title":["Biomedical Signal Processing and Control"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S174680942601058X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S174680942601058X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T06:20:21Z","timestamp":1779862821000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S174680942601058X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":38,"alternative-id":["S174680942601058X"],"URL":"https:\/\/doi.org\/10.1016\/j.bspc.2026.110504","relation":{},"ISSN":["1746-8094"],"issn-type":[{"value":"1746-8094","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Saliency-driven neural network for generalizable speech emotion recognition","name":"articletitle","label":"Article Title"},{"value":"Biomedical Signal Processing and Control","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.bspc.2026.110504","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"110504"}}