{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T20:17:01Z","timestamp":1774124221349,"version":"3.50.1"},"reference-count":53,"publisher":"Tsinghua University Press","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Big Data Min. Anal."],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.26599\/bdma.2024.9020079","type":"journal-article","created":{"date-parts":[[2025,1,28]],"date-time":"2025-01-28T18:53:58Z","timestamp":1738090438000},"page":"458-478","source":"Crossref","is-referenced-by-count":4,"title":["Seeing and Reasoning: A Simple Deep Learning Approach to Visual Question Answering"],"prefix":"10.26599","volume":"8","author":[{"given":"Rufai Yusuf","family":"Zakari","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"}]},{"given":"Jim Wilson","family":"Owusu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"}]},{"given":"Ke","family":"Qin","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"}]},{"given":"Tao","family":"He","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"}]},{"given":"Guangchun","family":"Luo","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Electronic Science and Technology of China,Chengdu,China,611731"}]}],"member":"11138","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref2","article-title":"AI-complete, AI-hard, or AI-easy: Classification of problems in artificial intelligence","volume-title":"Proc. 23rd Midwest Artificial Intelligence and Cognitive Science Conf.","author":"Yampolskiy"},{"key":"ref3","article-title":"VQA and visual reasoning: An overview of recent datasets, methods and challenges","author":"Zakari","year":"2022","journal-title":"arXiv preprint"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.417"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2023.104840"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-04355-w"},{"key":"ref9","volume-title":"Multimodal learning and reasoning for visual question answering","author":"Srisupavanich","year":"2020"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref11","article-title":"Show, ask, attend, and answer: A strong baseline for visual question answering","author":"Kazemi","year":"2017","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_28"},{"key":"ref14","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. 28th Int. Conf. Neural Information Processing Systems","author":"Ren"},{"key":"ref15","article-title":"Compositional attention networks for machine reasoning","author":"Hudson","year":"2018","journal-title":"arXiv preprint"},{"key":"ref16","article-title":"The neuro-symbolic concept learner: Interpreting scenes, words, and sentences from natural supervision","author":"Mao","year":"2019","journal-title":"arXiv preprint"},{"key":"ref17","article-title":"Neural-symbolic computing: An effective methodology for principled integration of machine learning and reasoning","author":"d\u2019Avila Garcez","year":"2019","journal-title":"arXiv preprint"},{"key":"ref18","article-title":"Neural turing machines","author":"Graves","year":"2014","journal-title":"arXiv preprint"},{"key":"ref19","article-title":"End-to-end memory networks","author":"Sukhbaatar","year":"2015","journal-title":"arXiv preprint"},{"key":"ref20","article-title":"Relational inductive biases, deep learning, and graph networks","author":"Battaglia","year":"2018","journal-title":"arXiv preprint"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00729"},{"key":"ref22","article-title":"Learning by abstraction: The neural state machine","author":"Hudson","year":"2019","journal-title":"arXiv preprint"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.12"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.325"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.93"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref27","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. 33rd Int. Conf. Neural Information Processing Systems","author":"Lu"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref29","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019","journal-title":"arXiv preprint"},{"key":"ref30","first-page":"4974","article-title":"A simple neural network module for relational reasoning","volume-title":"Proc. 31 st Int. Conf. neural Information Processing Systems","author":"Santoro"},{"key":"ref31","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. 33rd Int. Conf. on Neural Information Processing Systems","author":"Paszke"},{"key":"ref32","volume-title":"MMF: A multimodal framework for vision and language research","author":"Singh","year":"2020"},{"key":"ref33","volume-title":"Openvqa","author":"Yu","year":"2019"},{"key":"ref34","article-title":"HuggingFace\u2019s transformers: State-of-the-art natural language processing","author":"Wolf","year":"2020","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2995278"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2972830"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3173131"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00208"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2021.02.006"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0277693"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00192"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00637"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3323873.3325044"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3316767"},{"issue":"1","key":"ref48","first-page":"012047","article-title":"Research on visual question answering based on deep stacked attention network","volume-title":"J. Phys.: Conf. Ser.","volume":"1873","author":"Zhu"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00209"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-13065-x"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-08790-0"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104165"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045336"}],"container-title":["Big Data Mining and Analytics"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/8254253\/10856852\/10856898.pdf?arnumber=10856898","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,29]],"date-time":"2025-01-29T19:16:28Z","timestamp":1738178188000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10856898\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4]]},"references-count":53,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.26599\/bdma.2024.9020079","relation":{},"ISSN":["2096-0654","2097-406X"],"issn-type":[{"value":"2096-0654","type":"print"},{"value":"2097-406X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4]]}}}