{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T16:26:38Z","timestamp":1780676798875,"version":"3.54.1"},"reference-count":69,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Science and Technology Council","award":["111-2221-E-006-150-MY3"],"award-info":[{"award-number":["111-2221-E-006-150-MY3"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Affective Comput."],"published-print":{"date-parts":[[2023,10,1]]},"DOI":"10.1109\/taffc.2023.3258900","type":"journal-article","created":{"date-parts":[[2023,3,17]],"date-time":"2023-03-17T17:22:39Z","timestamp":1679073759000},"page":"3231-3243","source":"Crossref","is-referenced-by-count":33,"title":["Applying Segment-Level Attention on Bi-Modal Transformer Encoder for Audio-Visual Emotion Recognition"],"prefix":"10.1109","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-5548-2509","authenticated-orcid":false,"given":"Jia-Hao","family":"Hsu","sequence":"first","affiliation":[{"name":"Graduate Computer Science and Information Engineering, National Cheng Kung University, Tainan, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3947-2123","authenticated-orcid":false,"given":"Chung-Hsien","family":"Wu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Information Engineering, National Cheng Kung University, Tainan, Taiwan"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/34.895976"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682283"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-019-09628-3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ACII.2017.8273599"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-41299-9_34"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-917"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/1027933.1027958"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.77"},{"key":"ref9","article-title":"A framework for emotion recognition from human computer interaction in natural setting","volume-title":"Proc. 22nd ACM SIGKDD Conf. Knowl. Discov. Data Mining, Workshop Issues Sentiment Discov. Opin. Mining","author":"Constantine"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.cmpb.2022.106646"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-30036-4_26"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1523\/JNEUROSCI.07-10-03215.1987"},{"key":"ref14","volume-title":"The Merging of the Senses","author":"Stein","year":"1993"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133195"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-012-9172-2"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.3390\/s17071694"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2019.2961089"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3076364"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2019.2899884"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3423327.3423672"},{"key":"ref22","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-703"},{"key":"ref24","article-title":"A fine-tuned wav2vec 2.0\/hubert benchmark for speech emotion recognition, speaker verification and spoken language understanding","author":"Wang","year":"2021"},{"key":"ref25","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2017.2784096"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2886767"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2017.01.012"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3143011"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1017\/ATSIP.2014.11"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2013.2269314"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2008.52"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3143009"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1193"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053012"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TechSym.2016.7872646"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2021.03.007"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746924"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2011.2171334"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s11063-021-10713-5"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.challengehml-1.1"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2018.2890471"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2016.2553038"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1037\/a0025827"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2013.6553805"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICDEW.2006.145"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2012.26"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/SIU.2010.5649919"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1874246"},{"issue":"3","key":"ref53","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1961189.1961199","article-title":"LIBSVM: A library for support vector machines","volume":"2","author":"Chang","year":"2011","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2011-815"},{"key":"ref55","first-page":"377","article-title":"An automatic prosody tagger for spontaneous speech","volume-title":"Proc. 26th Int. Conf. Comput. Linguistics: Tech. Papers","author":"Dom\u00ednguez"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1037\/0022-3514.49.5.1416"},{"issue":"1","key":"ref57","first-page":"62","article-title":"Kappa coefficient: A popular measure of rater agreement","volume":"27","author":"Wan","year":"2015","journal-title":"Shanghai Arch. psychiatry"},{"key":"ref58","article-title":"Max-margin object detection","author":"King","year":"2015"},{"key":"ref59","first-page":"1305","article-title":"Convolutional neural tensor network architecture for community-based question answering","volume-title":"Proc.24th Int. Joint Conf. Artif. Intell.","author":"Qiu"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.76"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1162\/089976699300016007"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.56645\/jmde.v8i17.336"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3055755"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2018.06.003"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.3390\/app10207239"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1382"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016818"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.challengehml-1.3"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3167013"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3164745"}],"container-title":["IEEE Transactions on Affective Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5165369\/10330166\/10075429.pdf?arnumber=10075429","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,3]],"date-time":"2024-03-03T07:19:47Z","timestamp":1709450387000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10075429\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,1]]},"references-count":69,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/taffc.2023.3258900","relation":{},"ISSN":["1949-3045","2371-9850"],"issn-type":[{"value":"1949-3045","type":"electronic"},{"value":"2371-9850","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10,1]]}}}