{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T17:34:16Z","timestamp":1779212056714,"version":"3.51.4"},"reference-count":147,"publisher":"Tech Science Press","issue":"1","license":[{"start":{"date-parts":[[2024,7,19]],"date-time":"2024-07-19T00:00:00Z","timestamp":1721347200000},"content-version":"vor","delay-in-days":200,"URL":"https:\/\/doi.org\/10.32604\/TSP-CROSSMARKPOLICY"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2024]]},"DOI":"10.32604\/cmc.2024.053204","type":"journal-article","created":{"date-parts":[[2024,7,18]],"date-time":"2024-07-18T09:57:29Z","timestamp":1721296649000},"page":"1-35","update-policy":"https:\/\/doi.org\/10.32604\/tsp-crossmarkpolicy","source":"Crossref","is-referenced-by-count":88,"title":["A Comprehensive Survey on Deep Learning Multi-Modal Fusion: Methods, Technologies and Applications"],"prefix":"10.32604","volume":"80","author":[{"given":"Tianzhe","family":"Jiao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chaopeng","family":"Guo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyue","family":"Feng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuming","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"17807","published-online":{"date-parts":[[2024]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"164","DOI":"10.1016\/j.ins.2022.12.014","article-title":"Analysis of multimodal data fusion from an information theory perspective","volume":"623","author":"Dai","year":"Apr. 2023","journal-title":"Inf. Sci."},{"key":"ref2","doi-asserted-by":"crossref","first-page":"2381","DOI":"10.3390\/s23052381","article-title":"Effective techniques for multi-modal data fusion: A comparative analysis","volume":"23","author":"Paw\u0142owski","year":"Feb. 2023","journal-title":"Sensors"},{"key":"ref3","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3696453","article-title":"Privacy and integrity protection for IoT multimodal data using machine learning and blockchain","volume":"20","author":"Liu","year":"Mar. 2024","journal-title":"ACM Trans. Multim. Comput. Commun. Appl."},{"key":"ref4","series-title":"Proc. Intell. Human Comput. Interact.","first-page":"63","article-title":"Multimodal human computer interaction using hand gestures and speech","author":"Ridhun","year":"Apr. 2023"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"2280","DOI":"10.1109\/TMM.2022.3145663","article-title":"Multi-modality sensing and data fusion for multi-vehicle detection","volume":"25","author":"Roy","year":"Jan. 2023","journal-title":"IEEE Trans. Multimedia"},{"key":"ref6","first-page":"3605","article-title":"Multi-modal medical image fusion towards future research: A review","volume":"35","author":"Khan","year":"Sep. 2023","journal-title":"J. King Saud Univ.\u2014Comput. Inf. Sci."},{"key":"ref7","doi-asserted-by":"crossref","first-page":"65","DOI":"10.1109\/35.41402","article-title":"Integration of acoustic and visual speech signals using neural networks","volume":"27","author":"Yuhas","year":"Nov. 1989","journal-title":"IEEE Commun. Mag."},{"key":"ref8","first-page":"1","article-title":"Foundations & trends in multimodal machine learning: Principles, challenges, and open questions","volume":"12","author":"Liang","year":"Apr. 2024","journal-title":"ACM Comput. Surv."},{"key":"ref9","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","article-title":"Multimodal machine learning: A survey and taxonomy","volume":"41","author":"Baltru\u0161aitis","year":"Feb. 2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"ref10","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.imavis.2020.104042","article-title":"Deep multi-modal fusion for semantic image segmentation: A survey","volume":"105","author":"Zhang","year":"Jan. 2021","journal-title":"Image Vis. Comput."},{"key":"ref11","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"4603","article-title":"PointPainting: Sequential fusion for 3D object detection","author":"Vora","year":"Jun. 2020"},{"key":"ref12","doi-asserted-by":"crossref","first-page":"2122","DOI":"10.1007\/s11263-023-01784-z","article-title":"Multi-modal 3D object detection in autonomous driving: A survey","volume":"131","author":"Wang","year":"May 2023","journal-title":"Int. J. Comput. Vis."},{"key":"ref13","series-title":"NIPS'17: Proc. 31st Int. Conf. Neural Inf. Process. Syst.","first-page":"6000","article-title":"Attention is all you need","author":"Vaswani","year":"Dec. 2017"},{"key":"ref14","series-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vision","first-page":"18021","article-title":"ObjectFusion: Multi-modal 3D object detection with object-centric fusion","author":"Cai","year":"Oct. 2023"},{"key":"ref15","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"21653","article-title":"Virtual sparse convolution for multimodal 3D object detection","author":"Wu","year":"Jun. 2023"},{"key":"ref16","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"23799","article-title":"Continual detection transformer for incremental object detection","author":"Liu","year":"Jun. 2023"},{"key":"ref17","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"3238","article-title":"DynStatF: An efficient feature fusion strategy for LiDAR 3D object detection","author":"Rong","year":"Jun. 2023"},{"key":"ref18","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"3354","article-title":"Are we ready for autonomous driving? The kitti vision bench-mark suite","author":"Geiger","year":"Jun. 2012"},{"key":"ref19","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"2446","article-title":"Scalability in perception for autonomous driving: Waymo open dataset","author":"Sun","year":"Jun. 2020"},{"key":"ref20","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"11621","article-title":"nuScenes: A multi-modal dataset for autonomous driving","author":"Caesar","year":"Jun. 2020"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"2702","DOI":"10.1109\/TPAMI.2019.2926463","article-title":"The apolloscape dataset for autonomous driving","volume":"42","author":"Huang","year":"Jul. 2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"ref22","series-title":"Proc. Int. Conf. Robotics Automat.","first-page":"2267","author":"Pham","year":"May 2020"},{"key":"ref23","series-title":"Proc. IEEE Int. Intell. Transp. Syst. Conf. (ITSC)","first-page":"3095","article-title":"PandaSet: Advanced sensor suite dataset for autonomous driving","author":"Xiao","year":"Sep. 2021"},{"key":"ref24","series-title":"Proc. Int. Conf. Robot. Automat.","first-page":"5744","article-title":"Cirrus: A long-range bi-pattern lidar dataset","author":"Wang","year":"May 2021"},{"key":"ref25","series-title":"Proc. Int. Conf. Robot. Automat.","first-page":"9552","article-title":"The H3D dataset for full-surround 3D multi-object detection and tracking in crowded urban scenes","author":"Patil","year":"May 2019"},{"key":"ref26","series-title":"Proc. IEEE\/CVF Conf. Comput. Vision Pattern Recognit.","first-page":"8748","article-title":"Argoverse: 3D tracking and forecasting with rich maps","author":"Chang","year":"Jun. 2019"},{"key":"ref27","series-title":"Proc. NeurIPS Datasets Benchmarks","first-page":"1","article-title":"One million scenes for autonomous driving: Once dataset","author":"Mao","year":"Dec. 2021"},{"key":"ref28","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/T-AFFC.2011.15","article-title":"DEAP: A database for emotion analysis; using physiological signals","volume":"3","author":"Koelstra","year":"Jan. 2012","journal-title":"IEEE Trans. Affect. Comput."},{"key":"ref29","doi-asserted-by":"crossref","first-page":"162","DOI":"10.1109\/TAMD.2015.2431497","article-title":"Investigating critical frequency bands and channels for EEG-based emotion recognition with deep neural networks","volume":"7","author":"Zheng","year":"Sep. 2015","journal-title":"IEEE Trans. Auton. Mental Dev."},{"key":"ref30","doi-asserted-by":"crossref","first-page":"1993","DOI":"10.1109\/TMI.2014.2377694","article-title":"The multi-modal brain tumor image segmentation benchmark (BRATS)","volume":"34","author":"Menze","year":"Oct. 2015","journal-title":"IEEE Trans. Med. Imaging"},{"key":"ref31","first-page":"68","article-title":"Review the cancer genome atlas (TCGA): An immeasurable source of knowledge","volume":"19","author":"Tomczak","year":"Jan. 2015","journal-title":"Contemp. Oncol."},{"key":"ref32","unstructured":"C. Wu, X. Zhang, Y. Zhang, Y. Wang, and W. Xie, \u201cTowards generalist foundation model for radiology,\u201d arXiv preprint arXiv:2308.02463, 2023."},{"key":"ref33","unstructured":"C. Wu, X. Zhang, Y. Wang, Y. Zhang, and W. Xie, \u201cK-Diag: Knowledge-enhanced disease diagnosis in radiographic imaging,\u201d arXiv preprint arXiv:2302.11557, 2023."},{"key":"ref34","unstructured":"X. Zhang et al., \u201cPMC-VQA: Visual instruction tuning for medical visual question answering,\u201d arXiv preprint arXiv:2305.10415, 2023."},{"key":"ref35","series-title":"Proc. IEEE 18th Int. Symp. Biomed. Imag.","first-page":"1650","article-title":"Slake: A semantically-labeled knowledge-enhanced dataset for medical visual question answering","author":"Liu","year":"Apr. 2021"},{"key":"ref36","doi-asserted-by":"crossref","first-page":"277","DOI":"10.1038\/s41597-023-02100-7","article-title":"VinDr-Mammo: A large-scale benchmark dataset for computer-aided diagnosis in full-field digital mammography","volume":"10","author":"Nguyen","year":"May 2023","journal-title":"Sci. Data"},{"key":"ref37","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1016\/j.inffus.2016.10.004","article-title":"A review of natural language processing techniques for opinion mining systems","volume":"36","author":"Sun","year":"Jul. 2017","journal-title":"Inf. Fusion"},{"key":"ref38","doi-asserted-by":"crossref","first-page":"1479","DOI":"10.1007\/s10462-023-10555-8","article-title":"A comprehensive survey on deep learning-based approaches for multimodal sentiment analysis","volume":"56","author":"Ghorbanali","year":"Jul. 2023","journal-title":"Artif. Intell. Rev."},{"key":"ref39","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2897739","article-title":"Multi-modal analysis and prediction of persuasiveness in online social multimedia","volume":"6","author":"Park","year":"Oct. 2016","journal-title":"ACM Trans. Interact. Intell. Syst."},{"key":"ref40","series-title":"Proc. Assoc. Comput. Linguistics","first-page":"2236","article-title":"Multi-modal language analysis in the wild: CMU-MOSEI dataset and interpretable dynamic fusion graph","author":"Zadeh","year":"Jul. 2018"},{"key":"ref41","series-title":"Proc. Assoc. Comput. Linguistics","first-page":"3718","article-title":"CH-SIMS: A Chinese multi-modal sentiment analysis dataset with fine-grained annotation of modality","author":"Yu","year":"Jul. 2020"},{"key":"ref42","series-title":"Proc. Int. Conf. Multimodal Interfaces","first-page":"169","article-title":"Towards multi-modal sentiment analysis: Harvesting opinions from the web","author":"Morency","year":"Nov. 2011"},{"key":"ref43","unstructured":"A. Zadeh, R. Zellers, E. Pincus, and L. P. Morency, \u201cMOSI: Multi-modal corpus of sentiment intensity and subjectivity analysis in online opinion videos,\u201d arXiv preprint arXiv:1606.06259, 2016."},{"key":"ref44","doi-asserted-by":"crossref","first-page":"1334","DOI":"10.1109\/TAFFC.2021.3097002","article-title":"The multi-modal sentiment analysis in car reviews (MuSe-CaR) dataset: Collection, insights and improvements","volume":"14","author":"Stappen","year":"Apr. 2021","journal-title":"IEEE Trans. Affect. Comput."},{"key":"ref45","doi-asserted-by":"crossref","unstructured":"S. Poria, D. Hazarika, N. Majumder, G. Naik, E. Cambria, and R. Mihalcea, \u201cMELD: A multi-modal multi-party dataset for emotion recognition in conversations,\u201d arXiv preprint arXiv:1810.02508, 2018.","DOI":"10.18653\/v1\/P19-1050"},{"key":"ref46","series-title":"Proc. DE-FACTIFY@AAAI","first-page":"1","article-title":"Memotion 2: Dataset on sentiment and emotion analysis of memes","author":"Ramamoorthy","year":"Feb. 2022"},{"key":"ref47","series-title":"Proc. DE-FACTIFY@AAAI","first-page":"1","article-title":"FACTIFY: A multi-modal fact verification dataset","author":"Mishra","year":"Feb. 2022"},{"key":"ref48","series-title":"Proc. Int. Conf. Multimodal Interact","first-page":"400","article-title":"Introducing wesad, a multi-modal dataset for wearable stress and affect detection","author":"Schmidt","year":"Oct. 2018"},{"key":"ref49","doi-asserted-by":"crossref","first-page":"1798","DOI":"10.1109\/TPAMI.2013.50","article-title":"Representation learning: A review and new perspectives","volume":"38","author":"Bengio","year":"Mar. 2013","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"ref50","series-title":"Proc. Int. Conf. Data Sci. Cyber.","first-page":"68","article-title":"Multi-modal knowledge representation: A survey","author":"Hu","year":"Aug. 2023"},{"key":"ref51","series-title":"Proc. ACM Multimedia","first-page":"5985","article-title":"Graph to grid: Learning deep representations for multimodal emotion recognition","author":"Jin","year":"Oct. 2023"},{"key":"ref52","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.eswa.2023.119790","article-title":"MMHFNet: Multi-modal and multi-layer hybrid fusion network for voice pathology detection","volume":"223","author":"Mohammed","year":"Aug. 2023","journal-title":"Expert Syst. Appl."},{"key":"ref53","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.inffus.2023.102128","article-title":"Multi-modal person re-identification based on transformer relational regularization","volume":"103","author":"Zheng","year":"Mar. 2024","journal-title":"Inf. Fusion"},{"key":"ref54","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.patcog.2023.110086","article-title":"Joint representation learning for text and 3D point cloud","volume":"147","author":"Huang","year":"Mar. 2024","journal-title":"Pattern Recognit."},{"key":"ref55","doi-asserted-by":"crossref","first-page":"2701","DOI":"10.1109\/JIOT.2023.3292376","article-title":"WiVi-GR: Wireless-visual joint representation-based accurate gesture recognition","volume":"11","author":"Liu","year":"Jan. 2024","journal-title":"IEEE Internet Things J."},{"key":"ref56","doi-asserted-by":"crossref","first-page":"1057","DOI":"10.1007\/s00530-022-01038-x","article-title":"Image-text matching using multi-subspace joint representation","volume":"29","author":"Sun","year":"Jan. 2023","journal-title":"Multim. Syst."},{"key":"ref57","doi-asserted-by":"crossref","first-page":"63373","DOI":"10.1109\/ACCESS.2019.2916887","article-title":"Deep multimodal representation learning: A survey","volume":"7","author":"Guo","year":"May 2019","journal-title":"IEEE Access"},{"key":"ref58","doi-asserted-by":"crossref","first-page":"1765","DOI":"10.1007\/s11263-019-01290-1","article-title":"RGB-IR person re-identification by cross-modality similarity preservation","volume":"128","author":"Wu","year":"Feb. 2020","journal-title":"Int. J. Comput. Vis."},{"key":"ref59","first-page":"1","article-title":"Correlation-guided discriminative cross-modality features network for infrared and visible image fusion","volume":"73","author":"Cai","year":"Dec. 2023","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"ref60","doi-asserted-by":"crossref","first-page":"1213","DOI":"10.1007\/s13042-023-01964-w","article-title":"CoDF\u2010Net: Coordinated\u2010representation decision fusion network for emotion recognition with EEG and eye movement signals","volume":"15","author":"Gong","year":"Apr. 2024","journal-title":"Int. J. Mach. Learn. Cybern."},{"key":"ref61","series-title":"Proc. Med. Imag. Comput. Comput. Assisted Intervention","first-page":"745","article-title":"UWAT-GAN: Fundus fluorescein angiography synthesis via ultra-wide-angle transformation multi-scale GAN","author":"Fang","year":"Oct. 2023"},{"key":"ref62","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1023\/A:1020346032608","article-title":"Natural language description of human activities from video images based on concept hierarchy of actions","volume":"50","author":"Kojima","year":"Nov. 2002","journal-title":"Int. J. Comput. Vis."},{"key":"ref63","series-title":"Proc. 13th Conf. Eur. Chapter Assoc. for Comput. Linguistics","first-page":"747","article-title":"Midge: Generating image descriptions from computer vision detections","author":"Mitchell","year":"Apr. 2012"},{"key":"ref64","series-title":"Proc. 2013 Conf. Empirical Methods in Natural Lang. Processs","first-page":"1292","article-title":"Image description using visual dependency representations","author":"Elliott","year":"Oct. 2013"},{"key":"ref65","series-title":"Proc. Conf. North Amer. Chapt. Assoc. Comput. Linguistics: Human Lang. Technol.","first-page":"1494","article-title":"Translating videos to natural language using deep recurrent neural networks","author":"Venugopalan","year":"Jun. 2015"},{"key":"ref66","series-title":"Pattern Recog.: 37th, German Conf., GCPR 2015","first-page":"209","article-title":"The long-short story of movie description","author":"Rohrbach","year":"2015"},{"key":"ref67","doi-asserted-by":"crossref","first-page":"215","DOI":"10.1016\/j.inffus.2023.02.002","article-title":"HS2P: Hierarchical spectral and structure-preserving usion network for multimodal remote sensing image cloud and shadow removal","volume":"94","author":"Li","year":"Jun. 2023","journal-title":"Inf. Fusion."},{"key":"ref68","series-title":"Proc. Assoc. Comput. Linguistics","first-page":"8590","article-title":"AV-TranSpeech: Audio-visual robust speech-to-speech translation","author":"Huang","year":"Jul. 2023"},{"key":"ref69","doi-asserted-by":"crossref","first-page":"1506","DOI":"10.1109\/TASLP.2024.3363444","article-title":"METTS: Multilingual emotional text-to-speech by cross-speaker and cross-lingual emotion transfer","volume":"32","author":"Zhu","year":"Feb. 2024","journal-title":"IEEE ACM Trans. Audio Speech Lang. Process."},{"key":"ref70","series-title":"Proc. 57th Annu. Meeting Assoc. Comput. Linguistics","first-page":"3585","article-title":"Generating question relevant captions to aid visual question answering","author":"Wu","year":"Jul. 2019"},{"key":"ref71","series-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vision","first-page":"20098","article-title":"Going beyond nouns with vision & language models using synthetic data","author":"Cascante-Bonilla","year":"Oct. 2023"},{"key":"ref72","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.inffus.2023.102031","article-title":"Disentanglement translation network for multimodal sentiment analysis","volume":"102","author":"Zeng","year":"Feb. 2024","journal-title":"Inf. Fusion"},{"key":"ref73","doi-asserted-by":"crossref","first-page":"289","DOI":"10.1007\/s12559-022-10073-9","article-title":"TEDT: Transformer-based encoding-decoding translation network for multimodal sentiment analysis","volume":"15","author":"Wang","year":"Jan. 2023","journal-title":"Cogn. Comput."},{"key":"ref74","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"9341","article-title":"CAMEL: Capturing metaphorical alignment with context disentangling for multimodal emotion recognition","author":"Zhang","year":"Mar. 2024"},{"key":"ref75","unstructured":"W. Elisa et al., \u201cMulti-modal machine learning in image-based and clinical biomedicine: Survey and prospects,\u201d arXiv preprint arXiv:2311.02332, 2023."},{"key":"ref76","series-title":"Proc. ACM Multimedia","first-page":"9487","article-title":"Answer-based entity extraction and alignment for visual text question answering","author":"Yu","year":"Oct. 2023"},{"key":"ref77","doi-asserted-by":"crossref","first-page":"124","DOI":"10.1007\/s41019-023-00208-9","article-title":"Probing the impacts of visual context in multimodal entity alignment","volume":"8","author":"Wang","year":"Apr. 2023","journal-title":"Data Sci. Eng."},{"key":"ref78","doi-asserted-by":"crossref","first-page":"278","DOI":"10.1109\/TAI.2023.3254518","article-title":"Adversarial modality alignment network for cross-modal molecule retrieval","volume":"5","author":"Zhao","year":"Jan. 2024","journal-title":"IEEE Trans. Artif. Intell."},{"key":"ref79","first-page":"1","article-title":"Structural regression fusion for unsupervised multimodal change detection","volume":"61","author":"Sun","year":"Jul. 2023","journal-title":"IEEE Trans. Geosci. Remote. Sens."},{"key":"ref80","doi-asserted-by":"crossref","first-page":"5605","DOI":"10.1109\/TCSVT.2023.3262685","article-title":"EAF-WGAN: Enhanced alignment fusion-wasserstein generative adversarial network for turbulent image restoration","volume":"33","author":"Liu","year":"Oct. 2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"ref81","series-title":"Proc. EEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"122","article-title":"Multi-modal domain adaptation for fine-grained action recognition","author":"Munro","year":"Jun. 2020"},{"key":"ref82","series-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis.","first-page":"7464","article-title":"VideoBERT: A joint model for video and language representation learning","author":"Sun","year":"Oct. 2019"},{"key":"ref83","series-title":"Med. Imag. Comput. Comput. Assisted Intervention\u2013MICCAI 2023: 26th Int. Conf.","first-page":"704","article-title":"Representation, alignment, fusion: A generic transformer-based framework for multi-modal glaucoma recognition","author":"Zhou","year":"Oct. 8\u201312, 2023"},{"key":"ref84","series-title":"Proc. Thirty-First Int. Joint Conf. Artif. Intell.","first-page":"827","article-title":"AutoAlign: Pixel-instance feature aggregation for multi-modal 3D object detection","author":"Chen","year":"Jul. 2022"},{"key":"ref85","series-title":"Proc.\n\nIEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"17161","article-title":"DeepFusion: LiDAR-camera deep fusion for multi-modal 3D object detection","author":"Li","year":"2022"},{"key":"ref86","doi-asserted-by":"crossref","first-page":"722","DOI":"10.1109\/TITS.2020.3023541","article-title":"Deep learning for image and point cloud fusion in autonomous driving: A review","volume":"23","author":"Cui","year":"Feb. 2021","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"ref87","doi-asserted-by":"crossref","first-page":"3412","DOI":"10.1109\/TNNLS.2020.3015992","article-title":"Deep learning for lidar points clouds in autonomous driving: A review","volume":"32","author":"Li","year":"Aug. 2020","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"ref88","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"12460","article-title":"PI-RCNN: An efficient multi-sensor 3D object detector with point-based attentive cont-conv fusion module","author":"Xie","year":"Apr. 2020"},{"key":"ref89","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"77","article-title":"Pointnet: Deep learning on point sets for 3D classification and segmentation","author":"Charles","year":"Jul. 2017"},{"key":"ref90","first-page":"1","article-title":"A multilevel multimodal fusion transformer for remote sensing semantic segmentation","volume":"62","author":"Ma","year":"Mar. 2024","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"ref91","doi-asserted-by":"crossref","first-page":"281","DOI":"10.1016\/j.inffus.2023.02.005","article-title":"Visual tracking in complex scenes: A location fusion mechanism based on the combination of multiple visual cognition flows","volume":"96","author":"Liu","year":"Aug. 2023","journal-title":"Inf. Fusion"},{"key":"ref92","series-title":"2019 IEEE\/CVF Conf. Comput. Vision Pattern Recogn. Workshops (CVPRW)","first-page":"1230","article-title":"Sensor fusion for joint 3D object detection and semantic segmentation","author":"Meyer","year":"Jun. 2019"},{"key":"ref93","doi-asserted-by":"crossref","first-page":"1895","DOI":"10.3390\/rs12111895","article-title":"KDA3D: Key-point densification and multi-attention guidance for 3D object detection","volume":"12","author":"Wang","year":"Jun. 2020","journal-title":"Remote Sens."},{"key":"ref94","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"7460","article-title":"SimDistill: Simulated multi-modal distillation for BEV 3D object detection","author":"Zhao","year":"Mar. 2024"},{"key":"ref95","series-title":"Proc. Eur. Conf. Comput. Vision (ECCV) 2020","first-page":"35","article-title":"EPNet: Enhancing point features with image semantics for 3D object detection","author":"Huang","year":"Aug. 2020"},{"key":"ref96","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"11679","article-title":"Seeing through fog without seeing fog: Deep multi-modal sensor fusion in unseen adverse weather","author":"Bijelic","year":"Jun. 2020"},{"key":"ref97","series-title":"Proc. Eur. Conf. Comput. Visson (ECCV) 2020","first-page":"496","article-title":"RadarNet: Exploiting radar for robust perception of dynamic objects","author":"Yang","year":"Dec. 2020"},{"key":"ref98","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"444","article-title":"Robust multi-modal vehicle detection in foggy weather using complementary lidar and radar signals","author":"Qian","year":"Jun. 2021"},{"key":"ref99","series-title":"Proc. 2020 IEEE\/RSJ Int. Conf. Intell. Robot. Syst.","first-page":"10386","article-title":"CLOCs: Camera-LiDAR object candidates fusion for 3D object detection","author":"Pang","year":"Oct. 2020"},{"key":"ref100","doi-asserted-by":"crossref","first-page":"22080","DOI":"10.1109\/ACCESS.2021.3055491","article-title":"Fast and accurate 3D object detection for lidar-camera-based autonomous vehicles using one shared voxel-based backbone","volume":"9","author":"Wen","year":"Jan. 2021","journal-title":"IEEE Access"},{"key":"ref101","series-title":"Proc. Comput. Vis.-ECCV 2020","first-page":"720","article-title":"3D-CVF: Generating joint camera and lidar features using cross-view spatial feature fusion for 3D object detection","author":"Yoo","year":"Aug. 2020"},{"key":"ref102","unstructured":"Y. Kim, K. Park, M. Kim, D. Kum, and J. Won Choi, \u201c3D dual-fusion: Dual-domain dual-query camera-LiDAR fusion for 3D object detection,\u201d arXiv preprint arXiv:2211.13529, 2023."},{"key":"ref103","series-title":"Proc. Int. Conf. Robot. Automat","first-page":"7276","article-title":"MVX-Net: Multi-modal voxelnet for 3D object detection","author":"Sindagi","year":"May 2019"},{"key":"ref104","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"7345","article-title":"Multi-task multi-sensor fusion for 3D object detection","author":"Liang","year":"Jun. 2019"},{"key":"ref105","doi-asserted-by":"crossref","first-page":"51710","DOI":"10.1109\/ACCESS.2021.3070379","article-title":"RoIFusion: 3D object detection from lidar and vision","volume":"9","author":"Chen","year":"Apr. 2021","journal-title":"IEEE Access"},{"key":"ref106","doi-asserted-by":"crossref","first-page":"3332","DOI":"10.1007\/s11263-023-01869-9","article-title":"GLENet: Boosting 3D object detectors with generative label uncertainty estimation","volume":"131","author":"Zhang","year":"Jul. 2023","journal-title":"Int. J. Comput. Vis."},{"key":"ref107","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"14494","article-title":"SE-SSD: Self-ensembling single-stage object detector from point cloud","author":"Zheng","year":"Jun. 2021"},{"key":"ref108","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.jtbi.2024.111816","article-title":"A muti-modal feature fusion method based on deep learning for predicting immunotherapy response","volume":"586","author":"Li","year":"Jun. 2024","journal-title":"J. Theor. Biol."},{"key":"ref109","doi-asserted-by":"crossref","first-page":"1564","DOI":"10.1109\/JBHI.2023.3347794","article-title":"Dual-view learning based on images and sequences for molecular property prediction","volume":"28","author":"Zhang","year":"Mar. 2024","journal-title":"IEEE J. Biomed. Health Inf."},{"key":"ref110","doi-asserted-by":"crossref","DOI":"10.1093\/bioinformatics\/btad505","article-title":"scNCL: Transferring labels from scRNA-seq to scATAC-seq data with neighborhood contrastive regularization","volume":"39","author":"Yan","year":"Aug. 2023","journal-title":"Bioinform"},{"key":"ref111","doi-asserted-by":"crossref","first-page":"146","DOI":"10.1002\/ima.22310","article-title":"An improved multi-modal medical image fusion scheme based on hybrid combination of nonsubsampled contourlet transform and stationary wavelet transform","volume":"29","author":"Ramlal","year":"Jun. 2019","journal-title":"Int. J. Imaging Syst. Technol."},{"key":"ref112","doi-asserted-by":"crossref","first-page":"156","DOI":"10.1016\/j.inffus.2023.03.008","article-title":"A systematic review of trustworthy and explainable artificial intelligence in healthcare: Assessment of quality, bias risk, and data fusion","volume":"96","author":"Albahri","year":"Aug. 2023","journal-title":"Inf. Fusion"},{"key":"ref113","doi-asserted-by":"crossref","first-page":"133","DOI":"10.1109\/TAFFC.2020.3035535","article-title":"Interpretation of depression detection models via feature selection methods","volume":"14","author":"Alghowinem","year":"Nov. 2023","journal-title":"IEEE Trans. Affect. Comput."},{"key":"ref114","series-title":"Proc. BrainLes@Medical Imag. Comput. Comput. Assisted Intervention (MICCAI)","first-page":"311","article-title":"3D MRI brain tumor segmentation using autoencoder regularization","author":"Myronenko","year":"Jan. 2018"},{"key":"ref115","doi-asserted-by":"crossref","first-page":"102","DOI":"10.1016\/j.neucom.2021.09.032","article-title":"Feature-enhanced generation and multi-modality fusion based deep neural network for brain tumor segmentation with missing MR modalities","volume":"466","author":"Zhou","year":"Nov. 2021","journal-title":"Neurocomputing"},{"key":"ref116","doi-asserted-by":"crossref","first-page":"1763","DOI":"10.1109\/TMI.2021.3065918","article-title":"CANet: Context aware network for brain glioma segmentation","volume":"40","author":"Liu","year":"Mar. 2021","journal-title":"IEEE Trans. Med. Imaging"},{"key":"ref117","series-title":"Proc. Int. Conf. Bioinf. Comput. Biol.","first-page":"66","article-title":"SubOmiEmbed: Self-supervised repre-sentation learning of multi-omics data for cancer type classification","author":"Hashim","year":"Jun. 2022"},{"key":"ref118","series-title":"Proc. Neural Inf. Process. Syst. (NeurIPS) 2022","first-page":"1","article-title":"Flamingo: A visual language model for few-shot learning","author":"Alayrac","year":"Dec. 2022"},{"key":"ref119","series-title":"Proc. Int. Symp. Biomed. Imag.","first-page":"1","article-title":"Self-supervised visionlanguage pretraining for medical visual question answering","author":"Li","year":"Sep. 2023"},{"key":"ref120","unstructured":"K. Zhang et al., \u201cBiomedGPT: A unified and generalist biomedical generative pre-trained transformer for vision, language, and multi-modal tasks,\u201d arXiv preprint arXiv:2305.17100, 2023."},{"key":"ref121","unstructured":"S. Zhang et al., \u201cBiomedCLIP: A multi-modal biomedical foundation model pretrained from fifteen million scientific image-text pairs,\u201d arXiv preprint arXiv:2303.00915, 2024."},{"key":"ref122","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/2682899","article-title":"A review and meta-analysis of multi-modal affect detection systems","volume":"47","author":"D\u2019mello","year":"Feb. 2015","journal-title":"ACM Comput. Surv."},{"key":"ref123","series-title":"Proc. Assoc. Comput. Linguistics","first-page":"973","article-title":"Utterance-level multi-modal sentiment analysis","author":"P\u00e9rez-Rosas","year":"Aug. 2013"},{"key":"ref124","doi-asserted-by":"crossref","unstructured":"A. Zadeh, M. Chen, S. Poria, E. Cambria, and L. Morency, \u201cTensor fusion network for multi-modal sentiment analysis,\u201d arXiv preprint arXiv:1707.07250, 2017.","DOI":"10.18653\/v1\/D17-1115"},{"key":"ref125","series-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","first-page":"7837","article-title":"UniMSE: Towards unified multi-modal sentiment analysis and emotion recognition","author":"Hu","year":"Dec. 2022"},{"key":"ref126","series-title":"Proc. Assoc. Comput. Linguistics","first-page":"2247","article-title":"Efficient low-rank multi-modal fusion with modality-specific factors","author":"Liu","year":"Jul. 2018"},{"key":"ref127","series-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","first-page":"9180","article-title":"Improving multi-modal fusion with hierarchical mutual information maximization for multi-modal sentiment analysis","author":"Han","year":"Nov. 2021"},{"key":"ref128","series-title":"Proc. AAAI Conf. Artif. Intell.","first-page":"10790","article-title":"Learning modality-specific representations with self-supervised multi-task learning for multi-modal sentiment analysis","author":"Yu","year":"May 2021"},{"key":"ref129","series-title":"Proc. ACM Int. Conf. Multimodal Interact.","first-page":"284","article-title":"Deep multi-modal fusion for persuasiveness prediction","author":"Nojavanasghari","year":"Nov. 2016"},{"key":"ref130","series-title":"Proc. Assoc. Comput. Linguistics","first-page":"7900","article-title":"Speech-text dialog pre-training for spoken dialog understanding with explicit cross-modal alignment","author":"Yu","year":"Jul. 2023"},{"key":"ref131","unstructured":"Z. Wu, Z. Gong, J. Koo, and J. Hirschberg, \u201cMulti-modality multi-loss fusion network,\u201d arXiv preprint arXiv:2308.00264, 2023."},{"key":"ref132","series-title":"Proc. ACM Int. Conf. Multimedia","first-page":"6090","article-title":"Multi-label emotion analysis in conversation via multi-modal knowledge distillation","author":"Anand","year":"Oct. 2023"},{"key":"ref133","series-title":"Proc. Int. Conf. Acoust., Speech Signal Process.","first-page":"4573","article-title":"MMLatch: Bottom-up top-down fusion for multi-modal sentiment analysis","author":"Paraskevopoulos","year":"May 2022"},{"key":"ref134","series-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","first-page":"756","article-title":"Learning language-guided adaptive hyper-modality representation for multi-modal sentiment analysis","author":"Zhang","year":"Dec. 2023"},{"key":"ref135","doi-asserted-by":"crossref","first-page":"51315","DOI":"10.1109\/ACCESS.2022.3174215","article-title":"VAE-based adversarial multi-modal domain transfer for video-level sentiment analysis","volume":"10","author":"Wang","year":"May 2022","journal-title":"IEEE Access"},{"key":"ref136","doi-asserted-by":"crossref","unstructured":"D. Zhang et al., \u201cMM-LLMs: Recent advances in multimodal large language models,\u201d arXiv preprint arXiv:2401.13601, 2024.","DOI":"10.18653\/v1\/2024.findings-acl.738"},{"key":"ref137","series-title":"Proc. Int. Conf. Mach. Learn.","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"Jul. 2021"},{"key":"ref138","series-title":"Proc. Int. Conf. Mach. Learn.","first-page":"8821","article-title":"Zero-shot text-to-image generation","author":"Ramesh","year":"Jul. 2021"},{"key":"ref139","series-title":"2019 IEEE\/CVF Int. Conf. Comput. Vision (ICCV)","first-page":"4550","article-title":"Meta-Sim: Learning to generate synthetic datasets","author":"Kar","year":"Feb. 2020"},{"key":"ref140","doi-asserted-by":"crossref","first-page":"139","DOI":"10.1145\/3422622","article-title":"Generative adversarial networks","volume":"63","author":"Goodfellow","year":"Oct. 2020","journal-title":"Commun. ACM"},{"key":"ref141","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"7652","article-title":"PIXOR: Real-time 3D object detection from point clouds","author":"Yang","year":"Jun. 2018"},{"key":"ref142","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","first-page":"2589","article-title":"Deep parametric continuous convolutional neural networks","author":"Wang","year":"Jun. 2018"},{"key":"ref143","series-title":"Proc. Comput. Vision\u2014Eur. Conf. Comput. Vision (ECCV) 2020","first-page":"685","article-title":"Searching efficient 3D architectures with sparse point-voxel convolution","author":"Tang","year":"Nov. 2020"},{"key":"ref144","doi-asserted-by":"crossref","first-page":"2608","DOI":"10.1109\/LRA.2018.2818933","article-title":"A brief survey on the role of dimensionality reduction in manipulation learning and control","volume":"3","author":"Ficuciello","year":"Jul. 2018","journal-title":"IEEE Robot. Autom. Lett."},{"key":"ref145","series-title":"Proc. Int. Conf. Robot. Automat.","first-page":"2774","article-title":"BEVFusion: Multi-task multi-sensor fusion with unified bird\u2019s-Eye view representation","author":"Liu","year":"Jul. 2023"},{"key":"ref146","series-title":"Proc. 35th Int. Conf. Neural Inf. Process. Syst.","first-page":"16494","article-title":"Multi-modal virtual point 3D detection","author":"Yin","year":"Dec. 2021"},{"key":"ref147","series-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit","first-page":"5408","article-title":"Sparse fuse dense: Towards high quality 3D detection with depth completion","author":"Wu","year":"Jun. 2022"}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.techscience.com\/files\/cmc\/2024\/TSP_CMC-80-1\/TSP_CMC_53204\/TSP_CMC_53204.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,6]],"date-time":"2025-03-06T11:43:20Z","timestamp":1741261400000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v80n1\/57427"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":147,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2024]]},"published-print":{"date-parts":[[2024]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2024.053204","relation":{},"ISSN":["1546-2226"],"issn-type":[{"value":"1546-2226","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"2024-04-27","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-06-17","order":1,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-07-18","order":2,"name":"published","label":"Published Online","group":{"name":"publication_history","label":"Publication History"}}]}}