{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:33:27Z","timestamp":1776890007600,"version":"3.51.2"},"reference-count":116,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2024A1515010112"],"award-info":[{"award-number":["2024A1515010112"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Changsha Science and Technology Bureau Foundation","award":["kq2402082"],"award-info":[{"award-number":["kq2402082"]}]},{"name":"Hunan Provincial Key Research and Development Program","award":["2024AQ2041"],"award-info":[{"award-number":["2024AQ2041"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Affective Comput."],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1109\/taffc.2024.3506554","type":"journal-article","created":{"date-parts":[[2024,11,25]],"date-time":"2024-11-25T13:47:30Z","timestamp":1732542450000},"page":"1290-1306","source":"Crossref","is-referenced-by-count":4,"title":["ParaLBench: A Large-Scale Benchmark for Computational Paralinguistics Over Acoustic Foundation Models"],"prefix":"10.1109","volume":"16","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8487-0561","authenticated-orcid":false,"given":"Zixing","family":"Zhang","sequence":"first","affiliation":[{"name":"College of Computer Science and Electronic Engineering, Hunan University, Changsha, China"}]},{"given":"Weixiang","family":"Xu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Electronic Engineering, Hunan University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8074-1746","authenticated-orcid":false,"given":"Zhongren","family":"Dong","sequence":"additional","affiliation":[{"name":"College of Computer Science and Electronic Engineering, Hunan University, Changsha, China"}]},{"given":"Kanglin","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Electronic Engineering, Hunan University, Changsha, China"}]},{"given":"Yimeng","family":"Wu","sequence":"additional","affiliation":[{"name":"College of Computer Science and Electronic Engineering, Hunan University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5403-8432","authenticated-orcid":false,"given":"Jing","family":"Peng","sequence":"additional","affiliation":[{"name":"College of Computer Science and Electronic Engineering, Hunan University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9687-9918","authenticated-orcid":false,"given":"Runming","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Information Science and Engineering, Hunan Normal University, Changsha, China"}]},{"given":"Dong-Yan","family":"Huang","sequence":"additional","affiliation":[{"name":"UBTECH Robotics Corp, Shenzhen, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2012.02.005"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3037496"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446795"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijid.2020.07.069"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682896"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1915768117"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462579"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2009-103"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612835"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/HRI53351.2022.9889431"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054629"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3188223"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874246"},{"issue":"96","key":"ref14","first-page":"1","article-title":"openxbow\u2013introducing the passau open-source crossmodal bag-of-words toolkit","volume":"18","author":"Schmitt","year":"2017","journal-title":"J. Mach. Learn. Res."},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref18","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3235194"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3389631"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2017.2736999"},{"key":"ref22","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1002\/9781118706664"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11294"},{"key":"ref25","article-title":"The intonational system of english","author":"Liberman","year":"1975"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095623"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1097\/AUD.0000000000000776"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/s10484-006-9014-6"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1080\/00224545.1978.9924091"},{"issue":"5","key":"ref30","first-page":"477","article-title":"Gender classification in speech recognition using fuzzy logic and neural network","volume":"10","author":"Meena","year":"2013","journal-title":"Int. Arab J. Inf. Technol."},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.5121\/ijcseit.2012.2101"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ISRITI51436.2020.9315380"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2017.06.002"},{"issue":"1","key":"ref34","article-title":"A comparative study of gender and age classification in speech signals","volume":"5","author":"Sedaaghi","year":"2009","journal-title":"Iranian J. Elect. Electron. Eng."},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2024.103069"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746598"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447044"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2021.107379"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CCCI49893.2020.9256562"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0281323"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2021.114591"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2021.03.004"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1155\/2022\/6005446"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/s11235-011-9624-z"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2002-204"},{"key":"ref46","first-page":"155","article-title":"Rule-based detection of speech features for automatic speech recognition","volume-title":"Proc. Fundamentals Comput. Understanding: Speech Vis.","author":"De Mori"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1995.479684"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2003.1202279"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/2988257.2988269"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/89.861375"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP.2014.6936696"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414006"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1960"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2020EDP7196"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952552"},{"key":"ref56","article-title":"Significance of speaker embeddings and temporal context for depression detection","author":"Dumpala","year":"2021"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICOIACT50329.2020.9331995"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref60","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1016\/S1007-0214(05)70048-1"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639061"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-917"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.3390\/s20185022"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2781"},{"key":"ref66","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref67","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Baevski"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26521"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1316"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747348"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2024-2233"},{"key":"ref73","article-title":"Merbench: A unified evaluation benchmark for multimodal emotion recognition","author":"Lian","year":"2024"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.931"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1050"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_42"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p18-1208"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.30420\/456164008"},{"key":"ref81","article-title":"A database of age and gender annotated telephone speech","volume-title":"Proc. Int. Conf. Lang. Resour. Eval.","author":"Burkhardt"},{"key":"ref82","first-page":"4218","article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"Proc. Lang. Resour. Eval. Conf.","author":"Ardila"},{"key":"ref83","article-title":"Timit acoustic phonetic continuous speech corpus","volume-title":"Linguistic Data Consortium","author":"Garofolo","year":"1993"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2019.101027"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-329"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref92","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","volume-title":"Proc. North Amer. Chapter Assoc. Comput. Linguistics: Hum. Lang. Technol.","author":"Kim"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref94","first-page":"1298","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Baevski"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472669"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"ref97","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/3124420"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.3389\/fdgth.2021.799067"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3110146"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2021.116076"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102161"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.3390\/math10162913"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/IALP51396.2020.9310504"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00062"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413520"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1455"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1145\/3381014"},{"key":"ref109","first-page":"3123","article-title":"The distress analysis interview corpus of human and computer interviews","volume-title":"Proc. Lang. Resour. Eval. Conf.","author":"Gratch"},{"key":"ref110","article-title":"Superseded-cstr vctk corpus: English multi-speaker corpus for CSTR voice cloning toolkit","author":"Veaux","year":"2016"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096700"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747272"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746698"},{"key":"ref114","article-title":"Lora: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Representations Virtual Conf.","author":"Hu"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.repl4nlp-1.26"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2021.767971"}],"container-title":["IEEE Transactions on Affective Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/5165369\/11152495\/10767298.pdf?arnumber=10767298","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T17:46:36Z","timestamp":1757353596000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10767298\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":116,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/taffc.2024.3506554","relation":{},"ISSN":["1949-3045","2371-9850"],"issn-type":[{"value":"1949-3045","type":"electronic"},{"value":"2371-9850","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7]]}}}