{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T15:09:39Z","timestamp":1773155379786,"version":"3.50.1"},"reference-count":20,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"11","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2024,11,1]]},"DOI":"10.1587\/transinf.2024edl8034","type":"journal-article","created":{"date-parts":[[2024,7,21]],"date-time":"2024-07-21T22:12:02Z","timestamp":1721599922000},"page":"1463-1467","source":"Crossref","is-referenced-by-count":1,"title":["Multimodal Speech Emotion Recognition Based on Large Language Model"],"prefix":"10.1587","volume":"E107.D","author":[{"given":"Congcong","family":"FANG","sequence":"first","affiliation":[{"name":"School of Physics and Electronic Engineering, Jiangsu Normal University"}]},{"given":"Yun","family":"JIN","sequence":"additional","affiliation":[{"name":"School of Physics and Electronic Engineering, Jiangsu Normal University"}]},{"given":"Guanlin","family":"CHEN","sequence":"additional","affiliation":[{"name":"School of Physics and Electronic Engineering, Jiangsu Normal University"}]},{"given":"Yunfan","family":"ZHANG","sequence":"additional","affiliation":[{"name":"School of Physics and Electronic Engineering, Jiangsu Normal University"}]},{"given":"Shidang","family":"LI","sequence":"additional","affiliation":[{"name":"School of Physics and Electronic Engineering, Jiangsu Normal University"}]},{"given":"Yong","family":"MA","sequence":"additional","affiliation":[{"name":"School of Linguistic Sciences and Arts, Jiangsu Normal University"}]},{"given":"Yue","family":"XIE","sequence":"additional","affiliation":[{"name":"School of Information and Communication Engineering, Nanjing Institute of Technology"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] R.W. Picard, Affective Computing, MIT Press, 1997. 10.7551\/mitpress\/1140.001.0001","DOI":"10.7551\/mitpress\/1140.001.0001"},{"key":"2","unstructured":"[2] Y.H. Zhang and X.Z. Lin, \u201cEmotion can be calculated: A review of emotion computing,\u201d Computer Science, vol.35, no.5, 4, 2008."},{"key":"3","doi-asserted-by":"publisher","unstructured":"[3] S. Liu, M. Zhang, M. Fang, J. Zhao, K. Hou, and C.-C. Hung, \u201cSpeech emotion recognition based on transfer learning from the FaceNet framework,\u201d, The Journal of the Acoustical Society of America, vol.149, no.2, pp.1338-1345, 2021, 10.1121\/10.0003530","DOI":"10.1121\/10.0003530"},{"key":"4","doi-asserted-by":"publisher","unstructured":"[4] D. Issa, M.F. Demirci, and A. Yazici, \u201cSpeech emotion recognition with deep convolutional neural networks,\u201d Biomedical Signal Processing and Control, vol.59, 101894, 2020. 10.1016\/j.bspc.2020.101894","DOI":"10.1016\/j.bspc.2020.101894"},{"key":"5","doi-asserted-by":"publisher","unstructured":"[5] E. Batbaatar, M. Li, and K.H. Ryu, \u201cSemantic-emotion neural network for emotion recognition from text,\u201d IEEE Access, vol.7, pp.111866-111878, 2019. 10.1109\/ACCESS.2019.2934529","DOI":"10.1109\/ACCESS.2019.2934529"},{"key":"6","doi-asserted-by":"publisher","unstructured":"[6] S. Poria, N. Majumder, D. Hazarika, E. Cambria, A. Gelbukh, and A. 
Hussain, \u201cMultimodal sentiment analysis: Addressing key issues and setting up the baselines,\u201d IEEE Intelligent Systems, vol.33, no.6, pp.17-25, 2018. 10.1109\/MIS.2018.2882362","DOI":"10.1109\/MIS.2018.2882362"},{"key":"7","unstructured":"[7] G. Sahu, \u201cMultimodal speech emotion recognition and ambiguity resolution,\u201d CoRR, abs\/1904.06022, 2019."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] Z. Peng, Y. Lu, S. Pan, and Y. Liu, \u201cEfficient speech emotion recognition using multi-scale CNN and attention,\u201d ICASSP 2021, 2021 IEEE International Conference on Acoustics, Speech and Signal Processing, Toronto, ON, Canada, pp.3020-3024, IEEE, 2021. 10.1109\/icassp39728.2021.9414286","DOI":"10.1109\/ICASSP39728.2021.9414286"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] B. Li, D. Dimitriadis, and A. Stolcke, \u201cAcoustic and lexical sentiment analysis for customer service calls,\u201d Proc. 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.5876-5880, IEEE, 2019. 10.1109\/ICASSP.2019.8683679","DOI":"10.1109\/ICASSP.2019.8683679"},{"key":"10","unstructured":"[10] T. Brown, B. Mann, N. Ryder, M. Subbiah, J.D. Kaplan, P. Dhariwal, A. Neelakantan, P. Shyam, G. Sastry, A. Askell, S. Agarwal, A. Herbert-Voss, G. Krueger, T. Henighan, R. Child, A. Ramesh, D. Ziegler, J. Wu, C. Winter, C. Hesse, M. Chen, E. Sigler, M. Litwin, S. Gray, B. Chess, J. Clark, C. Berner, S. McCandlish, A. Radford, I. Sutskever, and D. Amodei, \u201cLanguage models are few-shot learners,\u201d Advances in Neural Information Processing Systems, vol.33, pp.1877-1901, 2020."},{"key":"11","doi-asserted-by":"publisher","unstructured":"[11] M. Chen, X. He, J. Yang, and H. Zhang, \u201c3-D convolutional recurrent neural networks with attention model for speech emotion recognition,\u201d IEEE Signal Process. Lett., vol.25, no.10, pp.1440-1444, 2018. 10.1109\/lsp.2018.2860246","DOI":"10.1109\/LSP.2018.2860246"},{"key":"12","unstructured":"[12] A. Neelakantan, T. Xu, R. Puri, A. Radford, J.M. Han, J. Tworek, Q. Yuan, N. Tezak, J.W. Kim, C. Hallacy, J. Heidecke, P. Shyam, B. Power, T.E. Nekoul, G. Sastry, G. Krueger, D. Schnurr, F.P. Such, K. Hsu, M. Thompson, T. Khan, T. Sherbakov, J. Jang, P. Welinder, and L. Weng, \u201cText and code embeddings by contrastive pre-training,\u201d arXiv preprint arXiv:2201.10005, 2022. 10.48550\/arXiv.2201.10005"},{"key":"13","unstructured":"[13] Y. Cao, S. Li, Y. Liu, Z. Yan, Y. Dai, P.S. Yu, and L. Sun, \u201cA comprehensive survey of AI-generated content (AIGC): A history of generative AI from GAN to ChatGPT,\u201d arXiv preprint arXiv:2303.04226, 2023. 10.48550\/arXiv.2303.04226"},{"key":"14","doi-asserted-by":"crossref","unstructured":"[14] F. Eyben, M. W\u00f6llmer, and B. Schuller, \u201cOpensmile: The Munich versatile and fast open-source audio feature extractor,\u201d Proc. 18th ACM International Conference on Multimedia, pp.1459-1462, 2010. 10.1145\/1873951.1874246","DOI":"10.1145\/1873951.1874246"},{"key":"15","doi-asserted-by":"publisher","unstructured":"[15] C. Busso, M. Bulut, C.-C. Lee, A. Kazemzadeh, E. Mower, S. Kim, J.N. Chang, S. Lee, and S.S. Narayanan, \u201cIEMOCAP: Interactive emotional dyadic motion capture database,\u201d Language Resources and Evaluation, vol.42, no.4, pp.335-359, 2008. 10.1007\/s10579-008-9076-6","DOI":"10.1007\/s10579-008-9076-6"},{"key":"16","doi-asserted-by":"crossref","unstructured":"[16] Y. Li, T. Zhao, and T. 
Kawahara, \u201cImproved end-to-end speech emotion recognition using self attention mechanism and multitask learning,\u201d Interspeech 2019, pp.2803-2807, ISCA, 2019. 10.21437\/interspeech.2019-2594","DOI":"10.21437\/Interspeech.2019-2594"},{"key":"17","unstructured":"[17] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, \u0141. Kaiser, and I. Polosukhin, \u201cAttention is all you need,\u201d Advances in Neural Information Processing Systems, vol.30, 2017."},{"key":"18","doi-asserted-by":"publisher","unstructured":"[18] C. Fan, J. Lin, R. Mao, and E. Cambria, \u201cFusing pairwise modalities for emotion recognition in conversations,\u201d Information Fusion, vol.106, 102306, 2024. 10.1016\/j.inffus.2024.102306","DOI":"10.1016\/j.inffus.2024.102306"},{"key":"19","doi-asserted-by":"publisher","unstructured":"[19] N. Lu, Z. Han, M. Han, and J. Qian, \u201cBi-stream graph learning based multimodal fusion for emotion recognition in conversation,\u201d Information Fusion, vol.106, 102272, 2024. 10.1016\/j.inffus.2024.102272","DOI":"10.1016\/j.inffus.2024.102272"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] W. Chen, X. Xing, X. Xu, J. Yang, and J. Pang, \u201cKey-sparse transformer for multimodal speech emotion recognition,\u201d ICASSP 2022, 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Singapore, Singapore, pp.6897-6901, 2022. 10.1109\/icassp43922.2022.9746598","DOI":"10.1109\/ICASSP43922.2022.9746598"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/11\/E107.D_2024EDL8034\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T03:43:15Z","timestamp":1730518995000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E107.D\/11\/E107.D_2024EDL8034\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"references-count":20,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2024]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2024edl8034","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"article-number":"2024EDL8034"}}