{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T02:17:42Z","timestamp":1778379462724,"version":"3.51.4"},"reference-count":91,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076262"],"award-info":[{"award-number":["62076262"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/tmm.2023.3306489","type":"journal-article","created":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T17:22:46Z","timestamp":1692379366000},"page":"3018-3033","source":"Crossref","is-referenced-by-count":18,"title":["Multimodal Boosting: Addressing Noisy Modalities and Identifying Modality Contribution"],"prefix":"10.1109","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9763-375X","authenticated-orcid":false,"given":"Sijie","family":"Mai","sequence":"first","affiliation":[{"name":"School of Electronics and Information Technology, Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8765-9945","authenticated-orcid":false,"given":"Ya","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Electronics and Information Technology, Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2301-7897","authenticated-orcid":false,"given":"Aolin","family":"Xiong","sequence":"additional","affiliation":[{"name":"School of Electronics and Information Technology, Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8842-2045","authenticated-orcid":false,"given":"Ying","family":"Zeng","sequence":"additional","affiliation":[{"name":"School of Electronics and Information Technology, Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4884-323X","authenticated-orcid":false,"given":"Haifeng","family":"Hu","sequence":"additional","affiliation":[{"name":"School of Electronics and Information Technology, Sun Yat-sen University, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref2","first-page":"1","article-title":"MultiBench: Multiscale benchmarks for multimodal representation learning","volume-title":"Proc. 35th Conf. Neural Inf. Process. Syst. Datasets Benchmarks Track (Round 1)","author":"Liang","year":"2021"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2018.2872063"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1014"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2020.3000510"},{"key":"ref7","first-page":"1513","article-title":"A variational information bottleneck approach to multi-omics data integration","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Lee","year":"2021"},{"key":"ref8","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2017.02.003"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1081"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1142\/9789812775320_0021"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413570"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3475957.3484451"},{"key":"ref14","article-title":"Multimodal fusion refiner networks","author":"Sankaran","year":"2021"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3082398"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"6558","DOI":"10.18653\/v1\/P19-1656","article-title":"Multimodal transformer for unaligned multimodal language sequences","volume-title":"Proc. 57th Annu. Meeting Assoc. Comput. Linguistics","author":"Tsai","year":"2019"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.540"},{"key":"ref18","first-page":"481","article-title":"Divide, conquer and combine: Hierarchical feature fusion network with local and global perspectives for multimodal affective computing","volume-title":"Proc.IEEE 57th Conf. Assoc. Comput. Linguistics","author":"Mai","year":"2019"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3068598"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.109"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531900"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2008.2007344"},{"key":"ref23","first-page":"1401","article-title":"A brief introduction to boosting","volume-title":"Proc. Int. Joint Conf. Artif. Intell.","author":"Schapire","year":"1999"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00713"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2096"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d17-1115"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5347"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.12.003"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2013.47"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/2993148.2993176"},{"key":"ref32","first-page":"1","article-title":"Ensemble of SVM trees for multimodal emotion recognition","volume-title":"Proc. IEEE Signal Inf. Process. Assoc. Summit Conf.","author":"Rozgic","year":"2012"},{"key":"ref33","article-title":"Neural language modeling with visual features","author":"Anastasopoulos","year":"2019"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414880"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2925966"},{"key":"ref37","first-page":"12113","article-title":"Deep multimodal multilinear fusion with high-order polynomial pooling","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Hou","year":"2019"},{"key":"ref38","first-page":"1","article-title":"Hadamard product for low-rank bilinear pooling","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kim","year":"2017"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2817340"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2021.3057757"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3072412"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3284750"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016892"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3542927"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2022.3213589"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2019.06.013"},{"key":"ref48","first-page":"58","article-title":"Quantum-inspired multimodal fusion for video sentiment analysis","volume-title":"Inf. Fusion","volume":"65","author":"Li","year":"2021"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.143"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1034"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.401"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6431"},{"key":"ref53","first-page":"1","article-title":"Audio-visual fusion for sentiment classification using cross-modal autoencoder","volume-title":"Proc. 32nd Conf. Neural Inf. Process. Syst.","author":"Dumpala","year":"2018"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.83"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/taffc.2022.3178231"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref59","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu","year":"2019"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref62","first-page":"1","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yang","year":"2019"},{"key":"ref63","first-page":"1","article-title":"Hierarchical question-image co-attention for visual question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu","year":"2016"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2022.3155290"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2019.2925204"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CIHLI.2013.6613272"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i02.5492"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136801"},{"key":"ref69","first-page":"1","article-title":"Learning sparse neural networks through L0 regularization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Louizos","year":"2018"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2022.3171679"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00900"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479927"},{"key":"ref74","article-title":"Weakly-supervised multi-task learning for multimodal affect recognition","author":"Dai","year":"2021"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref76","first-page":"1532","article-title":"Glove: Global vectors for word representation","volume-title":"Proc. Empirical Methods Natural Lang. Process.","author":"Pennington","year":"2014"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p18-1208"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/s10579-008-9076-6"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33017216"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475585"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/taffc.2022.3172360"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413690"},{"key":"ref85","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"ref86","doi-asserted-by":"crossref","first-page":"184","DOI":"10.1016\/j.inffus.2020.09.005","volume":"66","author":"Gkoumas","year":"2021","journal-title":"Inf. Fusion"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479919"},{"key":"ref88","article-title":"An empirical evaluation of generic convolutional and recurrent networks for sequence modeling","author":"Bai","year":"2018"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1406.1078"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.343"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/10384483\/10224356.pdf?arnumber=10224356","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T21:04:45Z","timestamp":1709327085000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10224356\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":91,"URL":"https:\/\/doi.org\/10.1109\/tmm.2023.3306489","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}