{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,22]],"date-time":"2026-06-22T08:47:41Z","timestamp":1782118061452,"version":"3.54.5"},"reference-count":29,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,20]],"date-time":"2026-04-20T00:00:00Z","timestamp":1776643200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100008530","name":"ERDF","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100011033","name":"State Agency of Research","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100011033","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["SoftwareX"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.softx.2026.102677","type":"journal-article","created":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T01:58:35Z","timestamp":1777600715000},"page":"102677","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["speech-emotion: A multilingual and multimodal toolkit for emotion recognition from speech"],"prefix":"10.1016","volume":"34","author":[{"given":"Ronghao","family":"Pan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tom\u00e1s","family":"Bernal-Beltr\u00e1n","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jos\u00e9 Antonio","family":"Garc\u00eda-D\u00edaz","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2457-1791","authenticated-orcid":false,"given":"Rafael","family":"Valencia-Garc\u00eda","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.softx.2026.102677_bib0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.compeleceng.2021.107280","article-title":"A human\u2013computer interaction framework for emotion recognition through time-series thermal video sequences","volume":"93","author":"Nayak","year":"2021","journal-title":"Comput Electr Eng"},{"issue":"2","key":"10.1016\/j.softx.2026.102677_bib0010","doi-asserted-by":"crossref","first-page":"1675","DOI":"10.1109\/TAFFC.2021.3128787","article-title":"Automatic emotion recognition in clinical scenario: a systematic review of methods","volume":"14","author":"Pepa","year":"2023","journal-title":"IEEE Trans Affect Comput"},{"key":"10.1016\/j.softx.2026.102677_bib0015","doi-asserted-by":"crossref","first-page":"5570","DOI":"10.1016\/j.promfg.2015.07.738","article-title":"Assistive technology and user-centered design: emotion as element for innovation","volume":"3","author":"Mallin","year":"2015","journal-title":"Procedia Manuf"},{"key":"10.1016\/j.softx.2026.102677_bib0020","series-title":"Interspeech","first-page":"2241","article-title":"Two-stream emotion recognition for call center monitoring","volume":"vol. 7","author":"Gupta","year":"2007"},{"key":"10.1016\/j.softx.2026.102677_bib0025","doi-asserted-by":"crossref","first-page":"19","DOI":"10.1016\/j.inffus.2022.03.009","article-title":"A systematic review on affective computing: emotion models, databases, and recent advances","volume":"83-84","author":"Wang","year":"2022","journal-title":"Inf Fusion"},{"key":"10.1016\/j.softx.2026.102677_bib0030","doi-asserted-by":"crossref","first-page":"36018","DOI":"10.1109\/ACCESS.2022.3163856","article-title":"Hybrid LSTM-transformer model for emotion recognition from speech audio files","volume":"10","author":"Andayani","year":"2022","journal-title":"IEEE Access"},{"issue":"8","key":"10.1016\/j.softx.2026.102677_bib0035","doi-asserted-by":"crossref","first-page":"5789","DOI":"10.1007\/s10462-021-09958-2","article-title":"Transformer models for text-based emotion detection: a review of BERT-based approaches","volume":"54","author":"Acheampong","year":"2021","journal-title":"Artif Intell Rev"},{"key":"10.1016\/j.softx.2026.102677_bib0040","series-title":"Findings of the association for computational linguistics: EMNLP 2025","first-page":"6257","article-title":"Multimodal emotion recognition in conversations: a survey of methods, trends, challenges and prospects","author":"Wu","year":"2025"},{"key":"10.1016\/j.softx.2026.102677_bib0045","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2025.112958","article-title":"Sentiment analysis and emotion recognition in social media: a comprehensive survey","volume":"174","author":"Bachate","year":"2025","journal-title":"Appl Soft Comput"},{"key":"10.1016\/j.softx.2026.102677_bib0050","doi-asserted-by":"crossref","first-page":"47795","DOI":"10.1109\/ACCESS.2021.3068045","article-title":"A comprehensive review of speech emotion recognition systems","volume":"9","author":"Wani","year":"2021","journal-title":"IEEE Access"},{"key":"10.1016\/j.softx.2026.102677_bib0055","article-title":"A systematic survey on multimodal emotion recognition using learning algorithms","volume":"17","author":"Ahmed","year":"2023","journal-title":"Intell Syst Appl"},{"key":"10.1016\/j.softx.2026.102677_bib0060","series-title":"ICASSP 2023-2023 IEEE international conference on acoustics, speech and signal processing (ICASSP)","first-page":"1","article-title":"Exploring wav2vec 2.0 fine tuning for improved speech emotion recognition","author":"Chen","year":"2023"},{"key":"10.1016\/j.softx.2026.102677_bib0065","doi-asserted-by":"crossref","DOI":"10.1016\/j.csi.2024.103856","article-title":"Spanish MEACorpus 2023: A multimodal speech\u2013text corpus for emotion analysis in Spanish from natural environments","volume":"90","author":"Pan","year":"2024","journal-title":"Comput Stand Interfaces"},{"key":"10.1016\/j.softx.2026.102677_bib0070","first-page":"359","article-title":"Overview of EmoSPeech at IberLEF 2024: multimodal speech-text emotion recognition in Spanish","volume":"73","author":"Pan","year":"2024","journal-title":"Proces Leng Nat"},{"key":"10.1016\/j.softx.2026.102677_bib0075","author":"L\u2019Huillier"},{"key":"10.1016\/j.softx.2026.102677_bib0080","author":"Gupta"},{"key":"10.1016\/j.softx.2026.102677_bib0085","series-title":"2021 IEEE automatic speech recognition and understanding workshop (ASRU)","first-page":"244","article-title":"w2v-BERT: combining contrastive learning and masked language modeling for self-supervised speech pre-training","author":"Chung","year":"2021"},{"key":"10.1016\/j.softx.2026.102677_bib0090","series-title":"Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.softx.2026.102677_bib0095","series-title":"Proceedings of the 20th Chinese national conference on computational linguistics","first-page":"1218","article-title":"A robustly optimized BERT pre-training approach with post-training","author":"Zhuang","year":"2021"},{"key":"10.1016\/j.softx.2026.102677_bib0100","author":"Barrault"},{"key":"10.1016\/j.softx.2026.102677_bib0105","series-title":"Pml4dc at ICLR 2020","article-title":"Spanish pre-trained BERT model and evaluation data","author":"Ca\u00f1ete","year":"2020"},{"key":"10.1016\/j.softx.2026.102677_bib0110","author":"Liu"},{"issue":"5","key":"10.1016\/j.softx.2026.102677_bib0115","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1371\/journal.pone.0196391","article-title":"The Ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English","volume":"13","author":"Livingstone","year":"2018","journal-title":"PLoS One"},{"key":"10.1016\/j.softx.2026.102677_bib0120","series-title":"2024 5th international conference on innovative trends in information technology (ICITIIT)","first-page":"1","article-title":"Emotion recognition from speech\u2013an LSTM approach with the tess dataset","author":"Pandiammal","year":"2024"},{"key":"10.1016\/j.softx.2026.102677_bib0125","series-title":"Combining frame and turn-level information for robust recognition of emotions within speech","first-page":"2249","author":"Vlasenko","year":"2007"},{"key":"10.1016\/j.softx.2026.102677_bib0130","series-title":"Proceedings of the 2018 conference on empirical methods in natural language processing","first-page":"3687","article-title":"CARER: contextualized affect representations for emotion recognition","author":"Saravia","year":"2018"},{"key":"10.1016\/j.softx.2026.102677_bib0135","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"4040","article-title":"GoEmotions: a dataset of fine-grained emotions","author":"Demszky","year":"2020"},{"key":"10.1016\/j.softx.2026.102677_bib0140","series-title":"International survey on emotion antecedents and reactions (ISEAR)","author":"Scherer","year":"1990"},{"key":"10.1016\/j.softx.2026.102677_bib0145","series-title":"Proceedings of the 57th annual meeting of the association for computational linguistics","first-page":"527","article-title":"MELD: a multimodal multi-party dataset for emotion recognition in conversations","author":"Poria","year":"2019"}],"container-title":["SoftwareX"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S235271102600169X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S235271102600169X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,22]],"date-time":"2026-06-22T07:55:09Z","timestamp":1782114909000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S235271102600169X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":29,"alternative-id":["S235271102600169X"],"URL":"https:\/\/doi.org\/10.1016\/j.softx.2026.102677","relation":{},"ISSN":["2352-7110"],"issn-type":[{"value":"2352-7110","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"speech-emotion: A multilingual and multimodal toolkit for emotion recognition from speech","name":"articletitle","label":"Article Title"},{"value":"SoftwareX","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.softx.2026.102677","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Author(s). Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"102677"}}