{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T18:41:55Z","timestamp":1765392115928,"version":"3.46.0"},"reference-count":53,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/access.2025.3636123","type":"journal-article","created":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T18:44:20Z","timestamp":1763750660000},"page":"205601-205611","source":"Crossref","is-referenced-by-count":0,"title":["EmoBridge: Aligning Speech and Language for Emotion Recognition via Q-Former"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6348-8238","authenticated-orcid":false,"given":"Yuntao","family":"Sun","sequence":"first","affiliation":[{"name":"Shandong College of Electronic Technology, Jinan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuehua","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shandong College of Electronic Technology, Jinan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanting","family":"Sun","sequence":"additional","affiliation":[{"name":"Shandong College of Electronic Technology, Jinan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.21437\/Interspeech.2024-788"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1109\/ICASSP43922.2022.9747095"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/TASLP.2023.3235194"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.21437\/interspeech.2023-1170"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1109\/ICASSP48485.2024.10446974"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.21437\/interspeech.2024-427"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.21437\/Interspeech.2023-819"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.21437\/Interspeech.2023-1236"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.1109\/ICASSP48485.2024.10448130"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1016\/j.iswa.2024.200436"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1016\/j.eswa.2023.122946"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.3389\/fnbot.2023.1181598"},{"key":"ref13","article-title":"Contrastive regularization for multimodal emotion recognition using audio and text","author":"Qian","year":"2022","journal-title":"arXiv:2211.10885"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1109\/SMC53992.2023.10394418"},{"key":"ref15","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1109\/TAFFC.2023.3340924"},{"key":"ref17","article-title":"Qwen3 technical report","volume-title":"arXiv:2505.09388","author":"Yang","year":"2025"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1093\/nsr\/nwae403"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1007\/978-981-96-2071-5_24"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1109\/ICASSP49660.2025.10889156"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/ICASSP49660.2025.10889485"},{"issue":"2","key":"ref22","first-page":"276","article-title":"Deep fusion: Integrating acoustic and lexical features for multimodal emotion recognition","volume":"10","author":"Zhao","year":"2019","journal-title":"IEEE Trans. Affect. Comput."},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.18653\/v1\/D17-1115"},{"key":"ref24","first-page":"47","article-title":"Deep belief networks for audio-visual emotion recognition","volume-title":"Proc. IEEE Int. Conf. Multimodal Interact. (ICMI)","author":"Kim"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref26","article-title":"SALMONN: Towards generic hearing abilities for large language models","author":"Tang","year":"2023","journal-title":"arXiv:2310.13289"},{"volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Hu","article-title":"LoRA: Low-rank adaptation of large language models","key":"ref27"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"volume-title":"Proc. Conf. Neural Inf. Process. Syst. (NeurIPS)","author":"Touvron","article-title":"LLaMA: Open and efficient foundation language models","key":"ref29"},{"key":"ref30","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Zhang","article-title":"OPT: Open pretrained transformer language models","key":"ref31"},{"key":"ref32","article-title":"Bloom: A 176B-parameter open-access multilingual language model","author":"Workshop","year":"2022","journal-title":"arXiv:2211.05100"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1145\/3604237.3626866"},{"key":"ref34","article-title":"InstructERC: Reforming emotion recognition in conversation with multi-task retrieval-augmented large language models","author":"Lei","year":"2023","journal-title":"arXiv:2309.11911"},{"key":"ref35","article-title":"DialogueLLM: Context and emotion knowledge-tuned large language models for emotion recognition in conversations","author":"Zhang","year":"2023","journal-title":"arXiv:2310.11374"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.1016\/j.inffus.2017.02.003"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.1016\/j.inffus.2023.101920"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.1609\/aaai.v32i1.12021"},{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.18653\/v1\/P19-1656"},{"doi-asserted-by":"publisher","key":"ref40","DOI":"10.1109\/CVPR52729.2023.01821"},{"doi-asserted-by":"publisher","key":"ref41","DOI":"10.1109\/FG59268.2024.10581982"},{"doi-asserted-by":"publisher","key":"ref42","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref43","article-title":"MELD: A multimodal multi-party dataset for emotion recognition in conversations","author":"Poria","year":"2018","journal-title":"arXiv:1810.02508"},{"doi-asserted-by":"publisher","key":"ref44","DOI":"10.1109\/TASLP.2021.3122291"},{"doi-asserted-by":"publisher","key":"ref45","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref46","article-title":"Qwen2.5-coder technical report","volume-title":"arXiv:2409.12186","author":"Hui","year":"2024"},{"key":"ref47","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017","journal-title":"arXiv:1711.05101"},{"doi-asserted-by":"publisher","key":"ref48","DOI":"10.1109\/TAFFC.2024.3369726"},{"doi-asserted-by":"publisher","key":"ref49","DOI":"10.21437\/Interspeech.2024-1983"},{"doi-asserted-by":"publisher","key":"ref50","DOI":"10.1109\/ICME59968.2025.11209040"},{"doi-asserted-by":"publisher","key":"ref51","DOI":"10.1109\/ICASSP49357.2023.10096966"},{"doi-asserted-by":"publisher","key":"ref52","DOI":"10.1109\/ACCESS.2025.3542948"},{"doi-asserted-by":"publisher","key":"ref53","DOI":"10.21437\/Interspeech.2024-651"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10820123\/11263794.pdf?arnumber=11263794","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T18:34:35Z","timestamp":1765391675000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11263794\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/access.2025.3636123","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2025]]}}}