{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T22:12:01Z","timestamp":1772489521655,"version":"3.50.1"},"reference-count":62,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172300"],"award-info":[{"award-number":["62172300"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372326"],"award-info":[{"award-number":["62372326"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62202336"],"award-info":[{"award-number":["62202336"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["2024-4-YB-03"],"award-info":[{"award-number":["2024-4-YB-03"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Artif. 
Intell."],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1109\/tai.2025.3602409","type":"journal-article","created":{"date-parts":[[2025,8,25]],"date-time":"2025-08-25T20:50:44Z","timestamp":1756155044000},"page":"1533-1545","source":"Crossref","is-referenced-by-count":0,"title":["MSAF: Multimodal Sentiment Detection via Multiscale Adaptive Fusion"],"prefix":"10.1109","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2313-7635","authenticated-orcid":false,"given":"Jihong","family":"Guan","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1150-1569","authenticated-orcid":false,"given":"Yulou","family":"Shu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5376-043X","authenticated-orcid":false,"given":"Wuchao","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8768-6740","authenticated-orcid":false,"given":"Wengen","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1949-2768","authenticated-orcid":false,"given":"Shuigeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Computer Science, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9931-4733","authenticated-orcid":false,"given":"Yichao","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Tongji University, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"36","article-title":"Robust sentiment detection on Twitter from biased and noisy data","author":"Barbosa","year":"2010","journal-title":"Proc. Coling: Posters"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2804381.2804387"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2009.02.063"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/COMPTELIX.2017.8004002"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.dcan.2021.10.003"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/s13278-021-00776-6"},{"key":"ref7","article-title":"Thumbs up? 
Sentiment classification using machine learning techniques","author":"Pang","year":"2002"},{"key":"ref8","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","author":"Yang","year":"2019"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2939744"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654930"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3132847.3133142"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210093"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301305"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.3301371"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3160060"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.175"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.287"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111848"},{"key":"ref19","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"ref20","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li","year":"2022"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2024.3396144"},{"key":"ref22","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"Lu","year":"2019","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"ref23","article-title":"VisualBERT: A simple and performant baseline for vision and language","author":"Li","year":"2019"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref25","first-page":"5579","article-title":"Vinvl: Making visual representations matter in vision-language models","volume-title":"Proc. 
CVPR","author":"Zhang","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref27","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00532"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/503"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.107676"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2021.3062200"},{"key":"ref32","article-title":"Efficient estimation of word representations in vector space","author":"Mikolov","year":"2013"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-018-1236-4"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2007.06.003"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-85099-1_8"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2017.08.003"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ISI.2017.8004895"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3035277"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2022.103193"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.108107"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3285238"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.28"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i8.26138"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/s11063-022-11124-w"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.122731"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref49","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020"},{"key":"ref50","article-title":"Sparse DETR: Efficient end-to-end object detection with learnable sparsity","author":"Roh","year":"2021"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-27674-8_2"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1239"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/d14-1181"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijepes.2019.02.022"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref57","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref58","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Kim","year":"2021"},{"key":"ref59","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"ref60","article-title":"Llava-Onevision: Easy visual task transfer","author":"Li","year":"2024"},{"key":"ref61","article-title":"VAuLT: Augmenting the vision-and-language transformer with the propagation of deep language representations","author":"Chochlakis","year":"2022"},{"key":"ref62","article-title":"Pytorch library for cam methods","year":"2021"}],"container-title":["IEEE Transactions on Artificial Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/9078688\/11417361\/11141507.pdf?arnumber=11141507","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T20:58:58Z","timestamp":1772485138000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11141507\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":62,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tai.2025.3602409","relation":{},"ISSN":["2691-4581"],"issn-type":[{"value":"2691-4581","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3]]}}}