{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T17:18:05Z","timestamp":1778779085400,"version":"3.51.4"},"reference-count":59,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62303299"],"award-info":[{"award-number":["62303299"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.knosys.2026.115987","type":"journal-article","created":{"date-parts":[[2026,4,12]],"date-time":"2026-04-12T17:37:01Z","timestamp":1776015421000},"page":"115987","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["VTGCN: Graph-based structural representation learning for visual-tactile multimodal fusion"],"prefix":"10.1016","volume":"343","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2104-3989","authenticated-orcid":false,"given":"Liang","family":"Li","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7175-1221","authenticated-orcid":false,"given":"Guochu","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0002-5998-270X","authenticated-orcid":false,"given":"Manliang","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Baojiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Haiyan","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6917-7949","authenticated-orcid":false,"given":"Bin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Zizhen","family":"Yi","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.115987_bib0001","doi-asserted-by":"crossref","first-page":"313","DOI":"10.1016\/j.inffus.2022.11.032","article-title":"Fusion of tactile and visual information in deep learning models for object recognition","volume":"92","author":"Babadian","year":"2023","journal-title":"Inf. Fusion"},{"issue":"7","key":"10.1016\/j.knosys.2026.115987_bib0002","doi-asserted-by":"crossref","first-page":"4349","DOI":"10.1109\/TSMC.2021.3096235","article-title":"Partial visual-tactile fused learning for robotic object recognition","volume":"52","author":"Zhang","year":"2021","journal-title":"IEEE Trans. Syst., Man, Cybernet.: Syst."},{"key":"10.1016\/j.knosys.2026.115987_bib0003","article-title":"TVT-Transformer: a tactile-visual-textual fusion network for object recognition","author":"Li","year":"2025","journal-title":"Inf. Fusion"},{"issue":"4","key":"10.1016\/j.knosys.2026.115987_bib0004","doi-asserted-by":"crossref","first-page":"6321","DOI":"10.1109\/LRA.2021.3093871","article-title":"Gem: glare or gloom, i can still see you\u2013end-to-end multi-modal object detection","volume":"6","author":"Mazhar","year":"2021","journal-title":"IEEe Robot. Autom. Lett."},{"key":"10.1016\/j.knosys.2026.115987_bib0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.128436","article-title":"Deep learning for 3D object recognition: a survey","author":"Muzahid","year":"2024","journal-title":"Neurocomputing."},{"issue":"12","key":"10.1016\/j.knosys.2026.115987_bib0006","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s10462-024-10941-w","article-title":"Deep models for multi-view 3D object recognition: a review","volume":"57","author":"Alzahrani","year":"2024","journal-title":"Artif. Intell. Rev."},{"key":"10.1016\/j.knosys.2026.115987_bib0007","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Weakly supervised monocular 3D detection with a single-view image","author":"Jiang","year":"2024"},{"key":"10.1016\/j.knosys.2026.115987_bib0008","series-title":"2022 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","article-title":"MOTSLAM: mOT-assisted monocular dynamic SLAM using single-view depth estimation","author":"Zhang","year":"2022"},{"key":"10.1016\/j.knosys.2026.115987_bib0009","series-title":"2024 IEEE Haptics Symposium (HAPTICS)","article-title":"Toward spatial-temporal consistency of joint visual-tactile perception in VR applications","author":"Zhao","year":"2024"},{"issue":"14","key":"10.1016\/j.knosys.2026.115987_bib0010","doi-asserted-by":"crossref","first-page":"14574","DOI":"10.1109\/JSEN.2022.3175153","article-title":"The impact of data augmentation on tactile-based object classification using deep learning approach","volume":"22","author":"Maus","year":"2022","journal-title":"IEEe Sens. J."},{"issue":"7","key":"10.1016\/j.knosys.2026.115987_bib0011","doi-asserted-by":"crossref","DOI":"10.1002\/aisy.202200371","article-title":"Machine learning for tactile perception: advancements, challenges, and opportunities","volume":"5","author":"Hu","year":"2023","journal-title":"Adv. Intell. Syst."},{"issue":"1","key":"10.1016\/j.knosys.2026.115987_bib0012","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1038\/s41528-023-00289-6","article-title":"A soft magnetoelectric finger for robots\u2019 multidirectional tactile perception in non-visual recognition environments","volume":"8","author":"Xu","year":"2024","journal-title":"Npj. Flex. Electron."},{"issue":"10","key":"10.1016\/j.knosys.2026.115987_bib0013","doi-asserted-by":"crossref","first-page":"12113","DOI":"10.1109\/TPAMI.2023.3275156","article-title":"Multimodal learning with transformers: a survey","volume":"45","author":"Xu","year":"2023","journal-title":"IEEe Trans. Pattern. Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.115987_bib0014","doi-asserted-by":"crossref","DOI":"10.1109\/TIM.2023.3326241","article-title":"VITO-transformer: a visual-tactile fusion network for object recognition","author":"Li","year":"2023","journal-title":"IEEe Trans. Instrum. Meas."},{"issue":"4","key":"10.1016\/j.knosys.2026.115987_bib0015","doi-asserted-by":"crossref","first-page":"875","DOI":"10.1109\/TMRB.2022.3215749","article-title":"Review of bioinspired vision-tactile fusion perception (VTFP): from humans to humanoids","volume":"4","author":"He","year":"2022","journal-title":"IEEe Trans. Med. Robot. Bionics."},{"issue":"1","key":"10.1016\/j.knosys.2026.115987_bib0016","doi-asserted-by":"crossref","DOI":"10.1038\/s41598-024-79981-0","article-title":"An adaptive multi-graph neural network with multimodal feature fusion learning for MDD detection","volume":"14","author":"Xing","year":"2024","journal-title":"Sci. Rep."},{"issue":"1","key":"10.1016\/j.knosys.2026.115987_bib0017","doi-asserted-by":"crossref","first-page":"6871","DOI":"10.1038\/s41467-024-51261-5","article-title":"Multimodal tactile sensing fused with vision for dexterous robotic housekeeping","volume":"15","author":"Mao","year":"2024","journal-title":"Nat. Commun."},{"key":"10.1016\/j.knosys.2026.115987_bib0018","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"BEV-guided multi-modality fusion for driving perception","author":"Man","year":"2023"},{"issue":"19","key":"10.1016\/j.knosys.2026.115987_bib0019","first-page":"6077","article-title":"MIUIC: a human-computer collaborative multimodal intention-understanding algorithm incorporating comfort analysis","volume":"40","author":"Zhou","year":"2024","journal-title":"Int. J. Hum.\u2013Comput. Inter."},{"key":"10.1016\/j.knosys.2026.115987_bib0020","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Binding touch to everything: learning unified multimodal tactile representations","author":"Yang","year":"2024"},{"key":"10.1016\/j.knosys.2026.115987_bib0021","series-title":"2024 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","article-title":"A case study on visual-audio-tactile cross-modal retrieval","author":"Wojcik","year":"2024"},{"key":"10.1016\/j.knosys.2026.115987_bib0022","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.knosys.2026.115987_bib0023","series-title":"Proceedings of the IEEE\/CVF conference on Computer Vision and Pattern Recognition","article-title":"A convnet for the 2020s","author":"Liu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115987_bib0024","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Convnext v2: co-designing and scaling convnets with masked autoencoders","author":"Woo","year":"2023"},{"key":"10.1016\/j.knosys.2026.115987_bib0025","unstructured":"Alexey, D.\"An image is worth 16x16 words: transformers for image recognition at scale.\" arXiv preprint arXiv: 2010.11929 (2020)."},{"key":"10.1016\/j.knosys.2026.115987_bib0026","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","article-title":"Pyramid vision transformer: a versatile backbone for dense prediction without convolutions","author":"Wang","year":"2021"},{"key":"10.1016\/j.knosys.2026.115987_bib0027","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","article-title":"Swin transformer: hierarchical vision transformer using shifted windows","author":"Liu","year":"2021"},{"key":"10.1016\/j.knosys.2026.115987_bib0028","series-title":"Proceedings of the IEEE\/CVF conference on Computer Vision and Pattern Recognition","article-title":"Slide-transformer: hierarchical vision transformer with local self-attention","author":"Pan","year":"2023"},{"key":"10.1016\/j.knosys.2026.115987_bib0029","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"You only need less attention at each stage in vision transformers","author":"Zhang","year":"2024"},{"issue":"1","key":"10.1016\/j.knosys.2026.115987_bib0030","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TRO.2009.2033627","article-title":"Tactile sensing\u2014From humans to humanoids","volume":"26","author":"Dahiya","year":"2009","journal-title":"IEEE Trans. Robot."},{"key":"10.1016\/j.knosys.2026.115987_bib0031","doi-asserted-by":"crossref","DOI":"10.1016\/j.measurement.2024.115332","article-title":"Tactile sensors: a review","author":"Meribout","year":"2024","journal-title":"Measurement"},{"issue":"1","key":"10.1016\/j.knosys.2026.115987_bib0032","first-page":"16","article-title":"Bioinspired passive tactile sensors enabled by reversible polarization of conjugated polymers","volume":"17","author":"He","year":"2025","journal-title":"Nanomicro Lett."},{"key":"10.1016\/j.knosys.2026.115987_bib0033","first-page":"1","article-title":"Tactile perception information recognition of prosthetic hand based on dnn-lstm","volume":"71","author":"Bai","year":"2022","journal-title":"IEEe Trans. Instrum. Meas."},{"key":"10.1016\/j.knosys.2026.115987_bib0034","series-title":"2024 IEEE International Conference on Robotics and Automation (ICRA)","article-title":"X-Tacformer: spatio-tempral attention model for Tactile recognition","author":"Hu","year":"2024"},{"issue":"2","key":"10.1016\/j.knosys.2026.115987_bib0035","doi-asserted-by":"crossref","first-page":"1151","DOI":"10.1109\/LRA.2023.3236884","article-title":"Visuo-tactile feedback-based robot manipulation for object packing","volume":"8","author":"Liang","year":"2023","journal-title":"IEEe Robot. Autom. Lett."},{"key":"10.1016\/j.knosys.2026.115987_bib0036","series-title":"2022 International Conference on Robotics and Automation (ICRA)","article-title":"Shapemap 3-d: efficient shape mapping through dense touch and vision","author":"Suresh","year":"2022"},{"key":"10.1016\/j.knosys.2026.115987_bib0037","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2023.126427","article-title":"GraphMFT: a graph network based multimodal fusion technique for emotion recognition in conversation","volume":"550","author":"Li","year":"2023","journal-title":"Neurocomputing."},{"issue":"9","key":"10.1016\/j.knosys.2026.115987_bib0038","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3649447","article-title":"Deep Multimodal Data Fusion","volume":"56","author":"Zhao","year":"2024","journal-title":"ACM. Comput. Surv."},{"key":"10.1016\/j.knosys.2026.115987_bib0039","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2022.109250","article-title":"Graph convolutional networks in language and vision: a survey","volume":"251","author":"Ren","year":"2022","journal-title":"Knowl. Based. Syst."},{"issue":"4","key":"10.1016\/j.knosys.2026.115987_bib0040","doi-asserted-by":"crossref","DOI":"10.1016\/j.ipm.2022.102946","article-title":"Contrastive graph convolutional networks with adaptive augmentation for text classification","volume":"59","author":"Yang","year":"2022","journal-title":"Inf. Process. Manage"},{"key":"10.1016\/j.knosys.2026.115987_bib0041","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Actional-structural graph convolutional networks for skeleton-based action recognition","author":"Li","year":"2019"},{"key":"10.1016\/j.knosys.2026.115987_bib0042","doi-asserted-by":"crossref","first-page":"4471","DOI":"10.1109\/TMM.2021.3118881","article-title":"I-GCN: incremental graph convolution network for conversation emotion detection","volume":"24","author":"Nie","year":"2021","journal-title":"IEEE Trans. Multimedia"},{"key":"10.1016\/j.knosys.2026.115987_bib0043","series-title":"Proceedings of the 31st ACM International Conference on Multimedia","article-title":"Learning a graph neural network with cross modality interaction for image fusion","author":"Li","year":"2023"},{"key":"10.1016\/j.knosys.2026.115987_bib0044","doi-asserted-by":"crossref","first-page":"296","DOI":"10.1016\/j.patcog.2019.06.013","article-title":"Graph-based multimodal fusion with metric learning for multimodal classification","volume":"95","author":"Angelou","year":"2019","journal-title":"Pattern. Recognit."},{"issue":"2","key":"10.1016\/j.knosys.2026.115987_bib0045","doi-asserted-by":"crossref","first-page":"818","DOI":"10.1109\/TCSVT.2022.3206865","article-title":"Lifelong visual-tactile spectral clustering for robotic object perception","volume":"33","author":"Liu","year":"2022","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.115987_bib0046","series-title":"International Conference on Machine Learning","article-title":"Graphnorm: a principled approach to accelerating graph neural network training","author":"Cai","year":"2021"},{"key":"10.1016\/j.knosys.2026.115987_bib0047","author":"Yang","year":"2022","journal-title":"Touch and go: Learning from human-collected vision and touch"},{"key":"10.1016\/j.knosys.2026.115987_bib0048","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Objectfolder 2.0: a multisensory object dataset for sim2real transfer","author":"Gao","year":"2022"},{"key":"10.1016\/j.knosys.2026.115987_bib0049","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"The objectfolder benchmark: multisensory learning with neural and real objects","author":"Gao","year":"2023"},{"key":"10.1016\/j.knosys.2026.115987_bib0050","series-title":"International Conference on Machine Learning","article-title":"Efficientnetv2: smaller models and faster training","author":"Tan","year":"2021"},{"key":"10.1016\/j.knosys.2026.115987_bib0051","first-page":"22614","article-title":"Revisiting resnets: improved training and scaling strategies","volume":"34","author":"Bello","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.115987_bib0052","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"A convnet for the 2020s","author":"Liu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115987_bib0053","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","article-title":"Shunted self-attention via multi-scale token aggregation","author":"Ren","year":"2022"},{"key":"10.1016\/j.knosys.2026.115987_bib0054","first-page":"8291","article-title":"Vision gnn: an image is worth graph of nodes","volume":"35","author":"Han","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.knosys.2026.115987_bib0055","series-title":"2018 IEEE International Conference on Robotics and Automation (ICRA)","article-title":"Slip detection with combined tactile and visual information","author":"Li","year":"2018"},{"issue":"16","key":"10.1016\/j.knosys.2026.115987_bib0056","doi-asserted-by":"crossref","first-page":"6872","DOI":"10.1109\/JSEN.2019.2912968","article-title":"CNN-based methods for object recognition with high-resolution tactile sensors","volume":"19","author":"Gandarias","year":"2019","journal-title":"IEEe Sens. J."},{"key":"10.1016\/j.knosys.2026.115987_bib0057","series-title":"2020 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","article-title":"Deep gated multi-modal learning: in-hand object pose changes estimation using tactile and image data","author":"Anzai","year":"2020"},{"key":"10.1016\/j.knosys.2026.115987_bib0058","doi-asserted-by":"crossref","unstructured":"Dave, V., F. Lygerakis, and E. Rueckert. \"Multimodal visual-tactile representation learning through self-supervised contrastive pre-training.\" arXiv preprint arXiv:2401.12024 (2024).","DOI":"10.1109\/ICRA57147.2024.10610228"},{"issue":"11","key":"10.1016\/j.knosys.2026.115987_bib0059","doi-asserted-by":"crossref","first-page":"12275","DOI":"10.1109\/TCYB.2021.3080321","article-title":"Visual\u2013tactile fused graph learning for object clustering","volume":"52","author":"Zhang","year":"2021","journal-title":"IEEe Trans. Cybern."}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126007136?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126007136?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T17:03:50Z","timestamp":1778778230000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126007136"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":59,"alternative-id":["S0950705126007136"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115987","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"VTGCN: Graph-based structural representation learning for visual-tactile multimodal fusion","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115987","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115987"}}