{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T23:54:33Z","timestamp":1773964473649,"version":"3.50.1"},"reference-count":56,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T00:00:00Z","timestamp":1775001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1016\/j.knosys.2026.115548","type":"journal-article","created":{"date-parts":[[2026,2,18]],"date-time":"2026-02-18T00:25:41Z","timestamp":1771374341000},"page":"115548","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["WiVi-UF: Unified feature learning in cross-modal transformers with WiFi and vision data fusion for enhanced human activity recognition"],"prefix":"10.1016","volume":"339","author":[{"given":"Xinhang","family":"Lin","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3958-7040","authenticated-orcid":false,"given":"Xianxun","family":"Zhu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3320-2526","authenticated-orcid":false,"given":"Erik","family":"Cambria","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"6","key":"10.1016\/j.knosys.2026.115548_bib0001","doi-asserted-by":"crossref","first-page":"4755","DOI":"10.1007\/s10462-021-10116-x","article-title":"Human activity recognition in artificial intelligence framework: a narrative review","volume":"55","author":"Gupta","year":"2022","journal-title":"Artif. Intell. Rev."},{"key":"10.1016\/j.knosys.2026.115548_bib0002","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2021.106970","article-title":"A review of multimodal human activity recognition with special emphasis on classification, applications, challenges and future directions","volume":"223","author":"Yadav","year":"2021","journal-title":"Knowl. Based. Syst."},{"issue":"9","key":"10.1016\/j.knosys.2026.115548_bib0003","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1186\/2192-1962-3-9","article-title":"Intention awareness: improving upon situation awareness in human-centric environments","volume":"3","author":"Howard","year":"2013","journal-title":"Hum.-Centric Comput. Inf. Sci."},{"issue":"5","key":"10.1016\/j.knosys.2026.115548_bib0004","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3705893","article-title":"Wi-Fi sensing techniques for human activity recognition: brief survey, potential challenges, and research directions","volume":"57","author":"Miao","year":"2025","journal-title":"ACM Comput. Surv."},{"issue":"4","key":"10.1016\/j.knosys.2026.115548_bib0005","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3607254","article-title":"AutoDLAR: a semi-supervised cross-modal contact-free human activity recognition system","volume":"20","author":"Lu","year":"2024","journal-title":"ACM Trans. Sens. Netw."},{"key":"10.1016\/j.knosys.2026.115548_bib0006","doi-asserted-by":"crossref","first-page":"504","DOI":"10.1109\/TRS.2024.3398127","article-title":"Corruption robustness analysis of radar micro-doppler classification for human activity recognition","volume":"2","author":"Zhou","year":"2024","journal-title":"IEEE Trans. Radar Syst."},{"key":"10.1016\/j.knosys.2026.115548_bib0007","doi-asserted-by":"crossref","first-page":"18578","DOI":"10.1109\/JSEN.2024.3388893","article-title":"Centaur: robust multimodal fusion for human activity recognition","volume":"24","author":"Xaviar","year":"2024","journal-title":"IEEE Sens. J."},{"issue":"4","key":"10.1016\/j.knosys.2026.115548_bib0008","doi-asserted-by":"crossref","first-page":"1874","DOI":"10.1109\/TAFFC.2024.3380375","article-title":"Fusion and discrimination: a multimodal graph contrastive learning framework for multimodal sarcasm detection","volume":"15","author":"Liang","year":"2024","journal-title":"IEEE Trans. Affect. Comput."},{"issue":"4","key":"10.1016\/j.knosys.2026.115548_bib0009","first-page":"1","article-title":"Deep heterogeneous contrastive hyper-graph learning for in-the-wild context-aware human activity recognition","volume":"7","author":"Ge","year":"2024","journal-title":"Proc. ACM Interact. Mobile Wearable Ubiquitous Technol."},{"key":"10.1016\/j.knosys.2026.115548_bib0010","series-title":"2024 IEEE International Symposium on Circuits and Systems (ISCAS)","first-page":"1","article-title":"Human activity recognition using Wi-Fi signals based on tokenized signals with attention","author":"Lee","year":"2024"},{"issue":"1","key":"10.1016\/j.knosys.2026.115548_bib0011","doi-asserted-by":"crossref","first-page":"487","DOI":"10.1109\/TMC.2021.3073969","article-title":"RF-based human activity recognition using signal adapted convolutional neural network","volume":"22","author":"Chen","year":"2021","journal-title":"IEEE Trans. Mob. Comput."},{"key":"10.1016\/j.knosys.2026.115548_bib0012","doi-asserted-by":"crossref","first-page":"10402","DOI":"10.1109\/JSEN.2025.3529889","article-title":"MSMFT: multi-stream multimodal factorised transformer for human activity recognition","volume":"25","author":"Zhou","year":"2025","journal-title":"IEEE Sens. J."},{"issue":"4","key":"10.1016\/j.knosys.2026.115548_bib0013","first-page":"1","article-title":"Temporal action localization for inertial-based human activity recognition","volume":"8","author":"Bock","year":"2024","journal-title":"Proc. ACM Interact. Mobile Wearable Ubiquitous Technol."},{"issue":"12","key":"10.1016\/j.knosys.2026.115548_bib0014","doi-asserted-by":"crossref","first-page":"9375","DOI":"10.1007\/s11760-024-03552-z","article-title":"Multi-modal hybrid hierarchical classification approach with transformers to enhance complex human activity recognition","volume":"18","author":"Ezzeldin","year":"2024","journal-title":"Signal Image Video Process."},{"key":"10.1016\/j.knosys.2026.115548_bib0015","doi-asserted-by":"crossref","first-page":"21","DOI":"10.1016\/j.engappai.2018.08.014","article-title":"A review of state-of-the-art techniques for abnormal human activity recognition","volume":"77","author":"Dhiman","year":"2019","journal-title":"Eng. Appl. Artif. Intell."},{"issue":"2","key":"10.1016\/j.knosys.2026.115548_bib0016","doi-asserted-by":"crossref","first-page":"877","DOI":"10.1007\/s00500-021-06238-7","article-title":"Skeleton-based human activity recognition using ConvLSTM and guided feature learning","volume":"26","author":"Yadav","year":"2022","journal-title":"Soft comput."},{"key":"10.1016\/j.knosys.2026.115548_bib0017","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2021.116424","article-title":"3D human action recognition: through the eyes of researchers","volume":"193","author":"Sarkar","year":"2022","journal-title":"Expert Syst. Appl."},{"issue":"1","key":"10.1016\/j.knosys.2026.115548_bib0018","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1186\/s42490-025-00088-2","article-title":"A novel ViT-BILSTM model for physical activity intensity classification in adults using gravity-based acceleration","volume":"7","author":"Wang","year":"2025","journal-title":"BMC Biomed. Eng."},{"key":"10.1016\/j.knosys.2026.115548_bib0019","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112264","article-title":"ConvTransformer attention network for temporal action detection","volume":"300","author":"Cui","year":"2024","journal-title":"Knowl. Based Syst."},{"key":"10.1016\/j.knosys.2026.115548_bib0020","article-title":"Privacy-preserving human activity sensing: a survey","volume":"4","author":"Yang","year":"2024","journal-title":"High-Confid. Comput."},{"key":"10.1016\/j.knosys.2026.115548_bib0021","doi-asserted-by":"crossref","DOI":"10.1016\/j.engappai.2023.107171","article-title":"WiFi-based human activity recognition through wall using deep learning","volume":"127","author":"Abuhoureyah","year":"2024","journal-title":"Eng. Appl. Artif. Intell."},{"key":"10.1016\/j.knosys.2026.115548_bib0022","doi-asserted-by":"crossref","first-page":"12706","DOI":"10.1109\/TMC.2024.3420405","article-title":"SAT: a selective adversarial training approach for WiFi-based human activity recognition","volume":"23","author":"Pan","year":"2024","journal-title":"IEEE Trans. Mob. Comput."},{"key":"10.1016\/j.knosys.2026.115548_bib0023","doi-asserted-by":"crossref","first-page":"39020","DOI":"10.1109\/JIOT.2024.3400773","article-title":"An AIoT framework with multi-modal frequency fusion for WiFi-based coarse and fine activity recognition","volume":"11","author":"Chen","year":"2024","journal-title":"IEEE Internet Things J."},{"key":"10.1016\/j.knosys.2026.115548_bib0024","series-title":"Journal of Physics: Conference Series","first-page":"012139","article-title":"LSTM-CNN network for human activity recognition using WiFi CSI data","volume":"1883","author":"Shang","year":"2021"},{"key":"10.1016\/j.knosys.2026.115548_bib0025","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TIM.2023.3289547","article-title":"A WiFi-based method for recognizing fine-grained multiple-subject human activities","volume":"72","author":"Moghaddam","year":"2023","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"10.1016\/j.knosys.2026.115548_bib0026","series-title":"International Conference on Big Data Analytics","first-page":"150","article-title":"IndoorGNN: a graph neural network based approach for indoor localization using WiFi RSSI","author":"Vishwakarma","year":"2023"},{"issue":"1","key":"10.1016\/j.knosys.2026.115548_bib0027","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3517241","article-title":"Towards robust gesture recognition by characterizing the sensing quality of WiFi signals","volume":"6","author":"Gao","year":"2022","journal-title":"Proc. ACM on Interact. Mobile Wearable Ubiquitous Technol."},{"key":"10.1016\/j.knosys.2026.115548_bib0028","article-title":"Non-contact multimodal indoor human monitoring systems: a survey","volume":"110","author":"Susarla","year":"2024","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.knosys.2026.115548_bib0029","unstructured":"X. Chen, J. Yang, X-Fi: a modality-invariant foundation model for multimodal human sensing, (2024). arXiv: 2410.10167."},{"issue":"16","key":"10.1016\/j.knosys.2026.115548_bib0030","doi-asserted-by":"crossref","first-page":"7292","DOI":"10.3390\/s23167292","article-title":"Human activity recognition via score level fusion of Wi-Fi CSI signals","volume":"23","author":"Lim","year":"2023","journal-title":"Sensors"},{"issue":"2","key":"10.1016\/j.knosys.2026.115548_bib0031","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3534588","article-title":"Rfcam: uncertainty-aware fusion of camera and wi-fi for real-time human identification with mobile devices","volume":"6","author":"Chen","year":"2022","journal-title":"Proc. ACM on Interact. Mobile Wearable Ubiquitous Technol."},{"issue":"10","key":"10.1016\/j.knosys.2026.115548_bib0032","doi-asserted-by":"crossref","first-page":"5734","DOI":"10.1109\/TCSVT.2023.3255832","article-title":"MAWKDN: a multimodal fusion wavelet knowledge distillation approach based on cross-view attention for action recognition","volume":"33","author":"Quan","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.115548_bib0033","doi-asserted-by":"crossref","first-page":"1699","DOI":"10.1109\/TIP.2024.3364022","article-title":"Disentangled cross-modal transformer for RGB-d salient object detection and beyond","volume":"33","author":"Chen","year":"2024","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.knosys.2026.115548_bib0034","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.124846","article-title":"TDS-Net: transformer enhanced dual-stream network for video anomaly detection","volume":"256","author":"Hussain","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.knosys.2026.115548_bib0035","doi-asserted-by":"crossref","first-page":"14","DOI":"10.1016\/j.neucom.2020.11.074","article-title":"Improving human action recognition by jointly exploiting video and WiFi clues","volume":"458","author":"Guo","year":"2021","journal-title":"Neurocomputing"},{"issue":"3","key":"10.1016\/j.knosys.2026.115548_bib0036","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3310194","article-title":"WiFi sensing with channel state information: a survey","volume":"52","author":"Ma","year":"2019","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"10.1016\/j.knosys.2026.115548_bib0037","doi-asserted-by":"crossref","first-page":"55","DOI":"10.1109\/OJITS.2023.3336795","article-title":"Characterization and selection of WiFi channel state information features for human activity detection in a smart public transportation system","volume":"5","author":"Alizadeh","year":"2024","journal-title":"IEEE Open J. Int. Trans. Syst."},{"key":"10.1016\/j.knosys.2026.115548_bib0038","unstructured":"K. Kim, S.N. Gowda, O. Mac Aodha, L. Sevilla-Lara, Capturing temporal information in a single frame: channel sampling strategies for action recognition, (2022). arXiv: 2201.10394."},{"key":"10.1016\/j.knosys.2026.115548_bib0039","series-title":"2023 IEEE 20th International Conference on Mobile Ad Hoc and Smart Systems (MASS)","first-page":"406","article-title":"WiMix: a lightweight multimodal human activity recognition system based on WiFi and vision","author":"Chen","year":"2023"},{"key":"10.1016\/j.knosys.2026.115548_bib0040","doi-asserted-by":"crossref","first-page":"20933","DOI":"10.1109\/TITS.2024.3455416","article-title":"TransKD: transformer knowledge distillation for efficient semantic segmentation","volume":"25","author":"Liu","year":"2024","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.knosys.2026.115548_bib0041","doi-asserted-by":"crossref","first-page":"144","DOI":"10.1016\/j.patrec.2021.11.030","article-title":"PEDENet: image anomaly localization via patch embedding and density estimation","volume":"153","author":"Zhang","year":"2022","journal-title":"Pattern Recognit. Lett."},{"key":"10.1016\/j.knosys.2026.115548_bib0042","doi-asserted-by":"crossref","first-page":"13860","DOI":"10.1109\/TPAMI.2023.3298301","article-title":"HDGT: heterogeneous driving graph transformer for multi-agent trajectory prediction via scene encoding","volume":"45","author":"Jia","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.115548_bib0043","series-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: Student Research Workshop","first-page":"221","article-title":"How many layers and why? An analysis of the model depth in transformers","author":"Simoulin","year":"2021"},{"key":"10.1016\/j.knosys.2026.115548_bib0044","doi-asserted-by":"crossref","first-page":"474","DOI":"10.1109\/JSTARS.2020.3036602","article-title":"Self-supervised pretraining of transformers for satellite image time series classification","volume":"14","author":"Yuan","year":"2020","journal-title":"IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens."},{"key":"10.1016\/j.knosys.2026.115548_bib0045","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Incrementally learning the hierarchical softmax function for neural language models","volume":"31","author":"Peng","year":"2017"},{"issue":"1","key":"10.1016\/j.knosys.2026.115548_bib0046","article-title":"Vision transformer and deep sequence learning for human activity recognition in surveillance videos","volume":"2022","author":"Hussain","year":"2022","journal-title":"Comput. Intell. Neurosci."},{"key":"10.1016\/j.knosys.2026.115548_bib0047","series-title":"Proceedings of the IEEE International Conference on Computer Vision Workshops","first-page":"3154","article-title":"Learning spatio-temporal features with 3d residual networks for action recognition","author":"Hara","year":"2017"},{"key":"10.1016\/j.knosys.2026.115548_bib0048","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"846","article-title":"Darklight networks for action recognition in the dark","author":"Chen","year":"2021"},{"key":"10.1016\/j.knosys.2026.115548_bib0049","series-title":"2021 11th International Conference on Computer Engineering and Knowledge (ICCKE)","first-page":"7","article-title":"CSI-based human activity recognition using convolutional neural networks","author":"Moshiri","year":"2021"},{"key":"10.1016\/j.knosys.2026.115548_bib0050","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"286","article-title":"Two-stream convolution augmented transformer for human activity recognition","volume":"35","author":"Li","year":"2021"},{"key":"10.1016\/j.knosys.2026.115548_bib0051","doi-asserted-by":"crossref","first-page":"80058","DOI":"10.1109\/ACCESS.2019.2923743","article-title":"Joint activity recognition and indoor localization with WiFi fingerprints","volume":"7","author":"Wang","year":"2019","journal-title":"IEEE Access"},{"key":"10.1016\/j.knosys.2026.115548_bib0052","unstructured":"J. Yang, S. Tang, Y. Xu, Y. Zhou, L. Xie, MaskFi: unsupervised learning of WiFi and vision representations for multimodal human activity recognition, (2024). arXiv: 2402.19258."},{"key":"10.1016\/j.knosys.2026.115548_bib0053","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops","first-page":"0","article-title":"WiFi and vision multimodal learning for accurate and robust device-free human activity recognition","author":"Zou","year":"2019"},{"key":"10.1016\/j.knosys.2026.115548_bib0054","series-title":"Proceedings of the 3rd ACM MobiCom Workshop on Integrated Sensing and Communications Systems","first-page":"25","article-title":"Towards pervasive sensing: a multimodal approach via CSI and RGB image modalities fusion","author":"Zhou","year":"2023"},{"key":"10.1016\/j.knosys.2026.115548_bib0055","series-title":"European Conference on Computer Vision","first-page":"72","article-title":"WIMans: a benchmark dataset for wifi-based multi-user activity sensing","author":"Huang","year":"2024"},{"key":"10.1016\/j.knosys.2026.115548_bib0056","first-page":"18756","article-title":"MM-FI: multi-modal non-intrusive 4D human dataset for versatile wireless sensing","volume":"36","author":"Yang","year":"2023","journal-title":"Adv. Neural Inf. Process. Syst."}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S095070512600290X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S095070512600290X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T21:28:37Z","timestamp":1773955717000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S095070512600290X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4]]},"references-count":56,"alternative-id":["S095070512600290X"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115548","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"WiVi-UF: Unified feature learning in cross-modal transformers with WiFi and vision data fusion for enhanced human activity recognition","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115548","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115548"}}