{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T14:55:56Z","timestamp":1773413756496,"version":"3.50.1"},"reference-count":113,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2024,2,1]],"date-time":"2024-02-01T00:00:00Z","timestamp":1706745600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,2,1]],"date-time":"2024-02-01T00:00:00Z","timestamp":1706745600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,2,1]],"date-time":"2024-02-01T00:00:00Z","timestamp":1706745600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62303361"],"award-info":[{"award-number":["62303361"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62302161"],"award-info":[{"award-number":["62302161"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1811463"],"award-info":[{"award-number":["U1811463"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004761","name":"Hainan Provincial Natural Science Foundation of China","doi-asserted-by":"publisher","award":["623QN266"],"award-info":[{"award-number":["623QN266"]}],"id":[{"id":"10.13039\/501100004761","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["WUT: 233110002"],"award-info":[{"award-number":["WUT: 233110002"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012152","name":"China National Postdoctoral Program for Innovative Talents","doi-asserted-by":"publisher","award":["BX20230114"],"award-info":[{"award-number":["BX20230114"]}],"id":[{"id":"10.13039\/501100012152","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2018AAA0101502"],"award-info":[{"award-number":["2018AAA0101502"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Syst. Man Cybern, Syst."],"published-print":{"date-parts":[[2024,2]]},"DOI":"10.1109\/tsmc.2023.3319964","type":"journal-article","created":{"date-parts":[[2023,10,17]],"date-time":"2023-10-17T18:02:28Z","timestamp":1697565748000},"page":"854-865","source":"Crossref","is-referenced-by-count":2,"title":["So Many Heads, So Many Wits: Multimodal Graph Reasoning for Text-Based Visual Question Answering"],"prefix":"10.1109","volume":"54","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9732-3217","authenticated-orcid":false,"given":"Wenbo","family":"Zheng","sequence":"first","affiliation":[{"name":"School of Computer Science and Artificial Intelligence, Wuhan University of Technology, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6452-9649","authenticated-orcid":false,"given":"Lan","family":"Yan","sequence":"additional","affiliation":[{"name":"College of Computer Science and Engineering, Hunan University, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9185-3989","authenticated-orcid":false,"given":"Fei-Yue","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Management and Control of Complex Systems, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3017530"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2019.2931003"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2016.2639788"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2016.2605132"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.44"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-0152-y"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.3029423"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.07.009"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.278"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3120194"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58545-7_41"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01276"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413924"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108455"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3276570"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-26316-3_39"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109337"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859865"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859603"},{"key":"ref23","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"30","author":"Vaswani"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1580"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01102"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3197238"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01007"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475620"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3132034"},{"key":"ref30","first-page":"1","article-title":"Semi-supervised classification with graph convolutional networks","volume-title":"Proc. Int. Conf. Learn. Rep. (ICLR)","author":"Kipf"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.109250"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.13646"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.11688"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413826"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01081"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01039"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_36"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467285"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2020.10.007"},{"key":"ref40","first-page":"12747","article-title":"Two heads are better than one: Hypergraph-enhanced graph reasoning for visual event ratiocination","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","volume":"139","author":"Zheng"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01246"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-63820-7_22"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i6.16629"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3383465"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.10.009"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.05.015"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413892"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102872"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2023.3282704"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2023.3236776"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2022.3228849"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10166-9"},{"key":"ref53","first-page":"4","article-title":"Spectral networks and locally connected networks on graphs","volume-title":"Proc. Int. Conf. Learn. Rep. (ICLR)","author":"Bruna"},{"key":"ref54","first-page":"2224","article-title":"Convolutional networks on graphs for learning molecular fingerprints","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst. (NIPS)","author":"Duvenaud"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11604"},{"key":"ref56","first-page":"2765","article-title":"Heterogeneous graph learning for visual commonsense reasoning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yu"},{"key":"ref57","volume-title":"Transformers are Graph Neural Networks","author":"Joshi","year":"2020"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00406"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref60","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"28","author":"Ren"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219861"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00051"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2339814"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00052"},{"key":"ref65","first-page":"1853","article-title":"Symbolic graph reasoning meets convolutions","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Liang"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00068"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_20"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_12"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3019644"},{"key":"ref71","author":"Goodfellow","year":"2016","journal-title":"Deep Learning"},{"key":"ref72","first-page":"13","article-title":"Disentangling trainability and generalization in deep neural networks","volume-title":"Proc. 37th Int. Conf. Mach. Learn.","volume":"119","author":"Xiao"},{"key":"ref73","first-page":"1","article-title":"Unsupervised neural machine translation","volume-title":"Proc. Int. Conf. Learn. Rep.","author":"Artetxe"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1549"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.96"},{"key":"ref76","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"27","author":"Sutskever"},{"key":"ref77","volume-title":"Statistical Natural Language Processing","author":"Zong","year":"2013"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"ref80","first-page":"2582","article-title":"Infographicvqa","volume-title":"Proc. IEEE\/CVF Winter Conf. Appl. Comput. Vis. (WACV)","author":"Mathew"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86337-1_42"},{"key":"ref82","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Rep. (ICLR)","author":"Kingma"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref84","volume-title":"Roberta: A robustly optimized BERT pretraining approach","author":"Liu","year":"2019"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/423"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.07.010"},{"key":"ref87","volume-title":"Google OCR","year":"2023"},{"key":"ref88","volume-title":"Microsoft OCR","year":"2023"},{"key":"ref89","volume-title":"Amazon OCR","year":"2023"},{"key":"ref90","volume-title":"iFLYTEK OCR","year":"2023"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16476"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475425"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475606"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00297"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86365-4_25"},{"key":"ref96","first-page":"1","article-title":"Graph relation transformer: Incorporating pairwise object features into the transformer architecture","volume-title":"Proc. CVPR Visual Workshop","author":"Yang"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01605"},{"key":"ref98","article-title":"GIT: A generative image-to-text transformer for vision and language","author":"Wang","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICME52920.2022.9859666"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref102","first-page":"1","article-title":"COCO-text: Dataset and benchmark for text detection and recognition in natural images","volume-title":"Proc. CVPR","author":"Veit"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1145\/1866029.1866080"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2013.221"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.378"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00251"},{"key":"ref108","article-title":"Document visual question answering challenge 2020","author":"Mathew","year":"2020","journal-title":"arXiv:2008.08899"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.201"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86331-9_47"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16431"},{"key":"ref113","author":"Hong","year":"2021","journal-title":"BROS: A pre-trained language model for understanding texts in document"}],"container-title":["IEEE Transactions on Systems, Man, and Cybernetics: Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6221021\/10403398\/10286555.pdf?arnumber=10286555","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,19]],"date-time":"2024-01-19T18:26:26Z","timestamp":1705688786000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10286555\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,2]]},"references-count":113,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tsmc.2023.3319964","relation":{},"ISSN":["2168-2216","2168-2232"],"issn-type":[{"value":"2168-2216","type":"print"},{"value":"2168-2232","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,2]]}}}