{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,7]],"date-time":"2026-01-07T06:14:44Z","timestamp":1767766484953,"version":"3.48.0"},"reference-count":21,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,7,12]],"date-time":"2025-07-12T00:00:00Z","timestamp":1752278400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,12]],"date-time":"2025-07-12T00:00:00Z","timestamp":1752278400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006190","name":"Research and Development","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006190","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003213","name":"Beijing Municipal Education Commission","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003213","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,7,12]]},"DOI":"10.1109\/indin64977.2025.11279055","type":"proceedings-article","created":{"date-parts":[[2026,1,6]],"date-time":"2026-01-06T18:33:35Z","timestamp":1767724415000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["GTNet: A Graph-Based Transformer with Cross-Modal Attention Mechanism"],"prefix":"10.1109","author":[{"given":"Qi","family":"Yao","sequence":"first","affiliation":[{"name":"Beijing University of Technology,School of Information Science and Technology,Beijing,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongyan","family":"Yang","sequence":"additional","affiliation":[{"name":"Beijing University of Technology,School of Information Science and Technology,Beijing,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huisheng","family":"Ma","sequence":"additional","affiliation":[{"name":"The 15th Research Institute of China Electronics Technology Group Corporation,Beijing,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"article-title":"Deep multimodal learning with missing modality: A survey[J]","year":"2024","author":"Wu","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICOEI.2019.8862698"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/s42979-024-03091-x"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2738401"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.433"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00332"},{"article-title":"Leverage points in modality shifts: Comparing language-only and multimodal word representations[J]","year":"2023","author":"Tikhonov","key":"ref8"},{"article-title":"Graph neural network for spatiotemporal data: methods and applications[J]","year":"2023","author":"Li","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2024.112866"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-024-00833-7"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TIE.2019.2962437"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2020.2985159"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN55064.2022.9891887"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2020.3005405"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.499"},{"article-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","year":"2020","author":"Dosovitskiy","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"article-title":". VL-BERT: Pre-training of Generic Visual-Linguistic Representations","year":"2019","author":"Su","key":"ref19"},{"author":"Lu","key":"ref20","article-title":"ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"}],"event":{"name":"2025 IEEE 23rd International Conference on Industrial Informatics (INDIN)","start":{"date-parts":[[2025,7,12]]},"location":"Kunming, China","end":{"date-parts":[[2025,7,15]]}},"container-title":["2025 IEEE 23rd International Conference on Industrial Informatics (INDIN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11278897\/11278845\/11279055.pdf?arnumber=11279055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,7]],"date-time":"2026-01-07T06:08:08Z","timestamp":1767766088000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11279055\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,12]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/indin64977.2025.11279055","relation":{},"subject":[],"published":{"date-parts":[[2025,7,12]]}}}