{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T16:16:59Z","timestamp":1769012219675,"version":"3.49.0"},"reference-count":63,"publisher":"Tech Science Press","issue":"3","license":[{"start":{"date-parts":[[2024,12,29]],"date-time":"2024-12-29T00:00:00Z","timestamp":1735430400000},"content-version":"vor","delay-in-days":363,"URL":"https:\/\/doi.org\/10.32604\/TSP-CROSSMARKPOLICY"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2024]]},"DOI":"10.32604\/cmc.2024.057453","type":"journal-article","created":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T07:10:27Z","timestamp":1732173027000},"page":"4195-4216","update-policy":"https:\/\/doi.org\/10.32604\/tsp-crossmarkpolicy","source":"Crossref","is-referenced-by-count":1,"title":["Adjusted Reasoning Module for Deep Visual Question Answering Using Vision Transformer"],"prefix":"10.32604","volume":"81","author":[{"given":"Stephen Abednego","family":"Philemon","sequence":"first","affiliation":[]},{"given":"Christian","family":"Adi Ananta","sequence":"additional","affiliation":[]},{"given":"Christine","family":"Dewi","sequence":"additional","affiliation":[]},{"given":"Hanna Prillysca","family":"Chernovita","sequence":"additional","affiliation":[]},{"given":"Abbott Po Shun","family":"Chen","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2024]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"345","DOI":"10.1111\/mice.13086","article-title":"Improving visual question answering for bridge inspection by pre-training with external data of image-text pairs","volume":"39","author":"Kunlamai","year":"2024","journal-title":"Comput. Civ. Infrastruct. Eng."},{"key":"ref2","doi-asserted-by":"crossref","first-page":"163","DOI":"10.1561\/0600000105","article-title":"Vision-language pre-training: Basics, recent advances, and future trends","volume":"14","author":"Gan","year":"2022","journal-title":"Found Trends Comput. Graph. Vis."},{"key":"ref3","doi-asserted-by":"crossref","first-page":"21","DOI":"10.1016\/j.cviu.2017.05.001","article-title":"Visual question answering: A survey of methods and datasets","volume":"163","author":"Wu","year":"2017","journal-title":"Comput. Vis. Image Underst."},{"key":"ref4","doi-asserted-by":"crossref","first-page":"4138","DOI":"10.17762\/ijritcc.v11i9.9781","article-title":"Novel approach to integrate various feature extraction techniques for the visual question answering system with skeletal images in the healthcare sector","volume":"11","author":"Melvin","year":"2023","journal-title":"Int. J. Recent Innov. Trends Comput. Commun."},{"key":"ref5","doi-asserted-by":"crossref","first-page":"419","DOI":"10.5573\/IEIESPC.2023.12.5.419","article-title":"Application of a neural network-based visual question answering system in preschool language education","volume":"12","author":"Cheng","year":"2023","journal-title":"IEIE Trans. Smart Process. Comput."},{"key":"ref6","series-title":"NAACL-HLT 2021\u20132021 Conf. North Am. Chapter Assoc. Comput. Linguist.: Human Lang. Technol.","first-page":"1908","article-title":"Worldly Wise (WoW)-cross-lingual knowledge fusion for fact-based visual spoken-question answering","author":"Ramnath","year":"2021"},{"key":"ref7","series-title":"Proc. ACM SIGKDD Int. Conf. Knowl. Discov. Data Min.","first-page":"3262","article-title":"PAM: Understanding product images in cross product category attribute extraction","author":"Lin","year":"2021"},{"key":"ref8","doi-asserted-by":"crossref","DOI":"10.1007\/s44196-023-00233-6","article-title":"Multiscale feature extraction and fusion of image and text in VQA","volume":"16","author":"Lu","year":"2023, Art. no. 54","journal-title":"Int. J. Comput. Intell. Syst."},{"key":"ref9","series-title":"IEEE Conf. Intell. Transp. Syst.","first-page":"1207","article-title":"Explaining autonomous driving actions with visual question answering","author":"Atakishiyev","year":"2023"},{"key":"ref10","first-page":"1044","article-title":"TG-VQA: Ternary game of video question answering","volume":"2023","author":"Li","year":"2023","journal-title":"IJCAI Int. Joint Conf. Artif. Intell."},{"key":"ref11","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2023.122239","article-title":"Learning neighbor-enhanced region representations and question-guided visual representations for visual question answering","volume":"238","author":"Gao","year":"2024, Art. no. 122239","journal-title":"Expert. Syst. Appl."},{"key":"ref12","unstructured":"Z. Zhang, A. Zhang, M. Li, H. Zhao, G. Karypis and A. Smola, \u201cMultimodal chain-of-thought reasoning in language models,\u201d Feb. 2023. doi: 10.48550\/arXiv.2302.00923."},{"key":"ref13","doi-asserted-by":"crossref","first-page":"652","DOI":"10.1007\/978-3-031-19833-5_38","article-title":"Weakly supervised grounding for VQA in vision-language transformers","volume":"13695","author":"Khan","year":"2022","journal-title":"Lecture Notes in Computer Science"},{"key":"ref14","doi-asserted-by":"crossref","first-page":"451","DOI":"10.1016\/j.neucom.2021.08.117","article-title":"Multi visual and textual embedding on visual question answering for blind people","volume":"465","author":"Le","year":"2021","journal-title":"Neurocomputing"},{"key":"ref15","doi-asserted-by":"crossref","DOI":"10.3390\/app13179735","article-title":"An effective Med-VQA method using a transformer with weights fusion of multiple fine-tuned models","volume":"13","author":"Al-Hadhrami","year":"2023, Art. no. 9735","journal-title":"Appl. Sci."},{"key":"ref16","doi-asserted-by":"crossref","first-page":"20967","DOI":"10.1007\/s10489-023-04564-x","article-title":"ST-VQA: Shrinkage transformer with accurate alignment for visual question answering","volume":"53","author":"Xia","year":"2023","journal-title":"Appl. Intell."},{"key":"ref17","doi-asserted-by":"crossref","first-page":"111","DOI":"10.4114\/intartif.vol27iss73pp111-128","article-title":"TRANS-VQA: Fully transformer-based image question-answering model using question-guided vision attention","volume":"27","author":"Koshti","year":"2024","journal-title":"Intel. Artif."},{"key":"ref18","doi-asserted-by":"crossref","DOI":"10.3389\/frai.2023.1023281","article-title":"COVID-Twitter-BERT: A natural language processing model to analyse COVID-19 content on Twitter","volume":"6","author":"M\u00fcller","year":"2023, Art. no. 411","journal-title":"Front. Artif. Intell."},{"key":"ref19","doi-asserted-by":"crossref","DOI":"10.3390\/s23010506","article-title":"A BERT framework to sentiment analysis of tweets","volume":"23","author":"Bello","year":"2023, Art. no. 506","journal-title":"Sensors"},{"key":"ref20","doi-asserted-by":"crossref","DOI":"10.1186\/s40537-022-00564-9","article-title":"The performance of BERT as data representation of text clustering","volume":"9","author":"Subakti","year":"2022, Art. no. 9","journal-title":"J. Big Data"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.3390\/app13063915","article-title":"RoBERTa-GRU: A hybrid deep learning model for enhanced sentiment analysis","volume":"13","author":"Tan","year":"2023","journal-title":"Appl. Sci."},{"key":"ref22","unstructured":"Y. Liu et al., \u201cRoBERTa: A robustly optimized BERT pretraining approach,\u201d Jul. 2019. doi: 10.48550\/arXiv.1907.11692."},{"key":"ref23","doi-asserted-by":"crossref","DOI":"10.3390\/electronics12204263","article-title":"Enhancing fashion classification with Vision Transformer (ViT) and developing recommendation fashion systems using DINOVA2","volume":"12","author":"Abd Alaziz","year":"2023, Art. no. 4263","journal-title":"Electronics"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"15845","DOI":"10.1109\/ACCESS.2024.3357946","article-title":"Exploiting data-efficient image transformer-based transfer learning for valvular heart diseases detection","volume":"12","author":"Jumphoo","year":"2024","journal-title":"IEEE Access"},{"key":"ref25","unstructured":"H. Bao, L. Dong, S. Piao, and F. Wei, \u201cBEIT: BERT pre-training of image transformers,\u201d 2022. doi: 10.48550\/arXiv.2106.08254."},{"key":"ref26","series-title":"Proc. IEEE Comput. Soc. Conf. Comput. Vis. Pattern Recognit.","first-page":"21","article-title":"Stacked attention networks for image question answering","volume":"2016","author":"Yang","year":"2016"},{"key":"ref27","series-title":"8th Int. Conf. Learn. Rep., ICLR","first-page":"1","article-title":"Clevrer: Collision events for video representation and reasoning","author":"Yi","year":"2020"},{"key":"ref28","first-page":"1031","article-title":"Neural-symbolic VQA: Disentangling reasoning from vision and language understanding","volume":"2018","author":"Yi","year":"2018","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"ref29","doi-asserted-by":"crossref","DOI":"10.1186\/s12880-022-00800-x","article-title":"BPI-MVQA: A bi-branch model for medical visual question answering","volume":"22","author":"Liu","year":"2022, Art. no. 6799","journal-title":"BMC Med. Imaging"},{"key":"ref30","doi-asserted-by":"crossref","first-page":"336","DOI":"10.1007\/s11263-019-01228-7","article-title":"Grad-CAM: Visual explanations from deep networks via gradient-based localization","volume":"128","author":"Selvaraju","year":"2020","journal-title":"Int. J. Comput. Vis."},{"key":"ref31","doi-asserted-by":"crossref","DOI":"10.7717\/peerj-cs.353","article-title":"Joint embedding VQA model based on dynamic word vector","volume":"7","author":"Ma","year":"2021, Art. no. e353","journal-title":"PeerJ Comput. Sci."},{"key":"ref32","doi-asserted-by":"crossref","first-page":"5374","DOI":"10.1109\/TKDE.2023.3332929","article-title":"Spatial-temporal interval aware individual future trajectory prediction","volume":"36","author":"Jiang","year":"2023","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"ref33","series-title":"NIPS\u201915: Proc. 28th Int. Conf. Neural Inform. Process. Syst.","first-page":"2953","article-title":"Exploring models and data for image question answering","author":"Ren","year":"May 2015"},{"key":"ref34","series-title":"COLING 2020-28th Int. Conf. Comput. Linguist.","first-page":"5205","article-title":"Explain by evidence: An explainable memory-based neural network for question answering","author":"Tran","year":"2020"},{"key":"ref35","series-title":"Proc. Int. Joint Conf. Neural Netw.","first-page":"1","article-title":"Component analysis for visual question answering architectures","author":"Kolling","year":"2020"},{"key":"ref36","doi-asserted-by":"crossref","first-page":"1","DOI":"10.7717\/peerj-cs.1400","article-title":"The multi-modal fusion in visual question answering: A review of attention mechanisms","volume":"9","author":"Lu","year":"2023","journal-title":"PeerJ Comput. Sci."},{"key":"ref37","doi-asserted-by":"crossref","first-page":"424","DOI":"10.1016\/j.inffus.2022.09.025","article-title":"Multimodal sentiment analysis: A systematic review of history, datasets, multimodal fusion methods, applications, challenges and future directions","volume":"91","author":"Gandhi","year":"2023","journal-title":"Inf. Fusion"},{"key":"ref38","unstructured":"L. B. Jimmy, R. K. Jamie, and G. E. Hinton, \u201cLayer normalization,\u201d 2016. doi: 10.48550\/arXiv.1607.06450."},{"key":"ref39","first-page":"4381","article-title":"Understanding and improving layer normalization","volume":"32","author":"Xu","year":"2019","journal-title":"Proc. 33rd Int. Conf. Neural Inform. Process. Syst."},{"key":"ref40","doi-asserted-by":"crossref","first-page":"3471","DOI":"10.1007\/s11760-024-03013-7","article-title":"A focus fusion attention mechanism integrated with image captions for knowledge graph-based visual question answering","volume":"18","author":"Ma","year":"2024","journal-title":"Signal Image Video Process."},{"key":"ref41","unstructured":"C. Kervadec, G. Antipov, M. Baccouche, and C. Wolf, \u201cEstimating semantic structure for the VQA answer space,\u201d 2020, arXiv2006.05726."},{"key":"ref42","doi-asserted-by":"crossref","unstructured":"\u00d6. \u00d6zdemir and E. Akag\u00fcnd\u00fcz, \u201cEnhancing visual question answering through question-driven image captions as prompts,\u201d Apr. 2024. doi: 10.1109\/CVPRW63382.2024.00163.","DOI":"10.1109\/CVPRW63382.2024.00163"},{"key":"ref43","first-page":"42","article-title":"A modification of Wu and Palmer semantic similarity measure","volume":"1","author":"Guessoum","year":"2016","journal-title":"Tenth Int. Conf. Mobile Ubiquitous Comput., Syst., Serv. Technol., UBICOMM 2016 Tenth Int. Conf. Mob. Ubiquitous Comput. Syst. Serv. Technol."},{"key":"ref44","doi-asserted-by":"crossref","unstructured":"M. A. Pratama and R. Mandala, \u201cImproving query expansion performances with pseudo relevance feedback and wu-palmer similarity on cross language information retrieval,\u201d 2022. doi: 10.1109\/ICAICTA56449.2022.9932984.","DOI":"10.1109\/ICAICTA56449.2022.9932984"},{"key":"ref45","doi-asserted-by":"crossref","DOI":"10.3390\/bioengineering10030380","article-title":"Vision-language model for visual question answering in medical imagery","volume":"10","author":"Bazi","year":"Mar. 2023, Art. no. 380","journal-title":"Bioengineering"},{"key":"ref46","doi-asserted-by":"crossref","first-page":"3279","DOI":"10.1109\/JSTARS.2023.3261361","article-title":"Visual question generation from remote sensing images","volume":"16","author":"Bashmal","year":"2023","journal-title":"IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens."},{"key":"ref47","first-page":"711","article-title":"XAI for image captioning using SHAP","volume":"39","author":"Dewi","year":"2023","journal-title":"J. Inf. Sci. Eng."},{"key":"ref48","doi-asserted-by":"crossref","DOI":"10.1016\/j.autcon.2022.104580","article-title":"Safety compliance checking of construction behaviors using visual question answering","volume":"144","author":"Ding","year":"2022, Art. no. 104580","journal-title":"Autom. Constr."},{"key":"ref49","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TGRS.2022.3192460","article-title":"Bi-modal transformer-based approach for visual question answering in remote sensing imagery","volume":"60","author":"Bazi","year":"2022","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"ref50","series-title":"AAAI 2020-34th AAAI Conf. Artif. Intell.","first-page":"13041","article-title":"Unified vision-language pre-training for image captioning and VQA","author":"Zhou","year":"2020"},{"key":"ref51","doi-asserted-by":"crossref","first-page":"32897","DOI":"10.1007\/s11042-020-09509-x","article-title":"Weight analysis for various prohibitory sign detection and recognition using deep learning","volume":"79","author":"Dewi","year":"2020","journal-title":"Multimed. Tools Appl."},{"key":"ref52","doi-asserted-by":"crossref","first-page":"1681","DOI":"10.1109\/JBHI.2022.3163751","article-title":"Vision-language transformer for interpretable pathology visual question answering","volume":"27","author":"Naseem","year":"2023","journal-title":"IEEE J. Biomed. Heal. Inform."},{"key":"ref53","doi-asserted-by":"crossref","first-page":"1886","DOI":"10.1109\/JBHI.2023.3294249","article-title":"K-PathVQA: Knowledge-aware multimodal representation for pathology visual question answering","volume":"28","author":"Naseem","year":"2024","journal-title":"IEEE J. Biomed. Heal. Inform."},{"key":"ref54","doi-asserted-by":"crossref","first-page":"437","DOI":"10.1613\/jair.1.13113","article-title":"Image captioning as an assistive technology: Lessons learned from VizWiz 2020 challenge","volume":"73","author":"Dognin","year":"2022","journal-title":"J. Artif. Intell. Res."},{"key":"ref55","series-title":"Proc. IEEE Comput. Soc. Conf. Comput. Vis. Pattern Recognit.","first-page":"3608","article-title":"VizWiz grand challenge: Answering visual questions from blind people","author":"Gurari","year":"2018"},{"key":"ref56","doi-asserted-by":"crossref","first-page":"726","DOI":"10.1007\/978-3-031-43904-9_70","author":"van Sonsbeek","year":"2023","journal-title":"Lecture Notes in Computer Science"},{"key":"ref57","series-title":"Proc. IEEE Comput. Soc. Conf. Comput. Vis. Pattern Recognit.","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"ref58","doi-asserted-by":"crossref","first-page":"522","DOI":"10.1007\/978-3-030-32251-9_57","article-title":"Overcoming data limitation in medical visual question answering","author":"Nguyen","year":"2019","journal-title":"Lecture Notes in Computer Science"},{"key":"ref59","doi-asserted-by":"crossref","first-page":"64","DOI":"10.1007\/978-3-030-87240-3_7","article-title":"Multiple meta-model quantifying for medical visual question answering","author":"Do","year":"2021","journal-title":"Lecture Notes in Computer Science"},{"key":"ref60","doi-asserted-by":"crossref","first-page":"3332","DOI":"10.1109\/TMI.2022.3185008","article-title":"VQAMix: Conditional triplet mixup for medical visual question answering","volume":"41","author":"Gong","year":"2022","journal-title":"IEEE Trans. Med. Imaging"},{"key":"ref61","article-title":"AMAM: An attention-based multimodal alignment model for medical visual question answering","volume":"255","author":"Pan","year":"Nov. 2022, Art. no. 109763","journal-title":"Knowl.-Based Syst."},{"key":"ref62","doi-asserted-by":"crossref","first-page":"398","DOI":"10.1007\/s11263-018-1116-0","article-title":"Making the V in VQA matter: Elevating the role of image understanding in visual question answering","volume":"127","author":"Goyal","year":"Apr. 2019","journal-title":"Int. J. Comput. Vis."},{"key":"ref63","unstructured":"V. Kazemi and A. Elqursh, \u201cShow, ask, attend, and answer: A strong baseline for visual question answering,\u201d Apr. 2017. doi: 10.48550\/arXiv.1704.03162."}],"container-title":["Computers, Materials &amp; Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.techscience.com\/cmc\/v81n3\/59043\/pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,7]],"date-time":"2025-03-07T06:07:13Z","timestamp":1741327633000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v81n3\/59043"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":63,"journal-issue":{"issue":"3","published-online":{"date-parts":[[2024]]},"published-print":{"date-parts":[[2024]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2024.057453","relation":{},"ISSN":["1546-2226"],"issn-type":[{"value":"1546-2226","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"2024-08-18","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-11-01","order":1,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2024-12-19","order":2,"name":"published","label":"Published Online","group":{"name":"publication_history","label":"Publication History"}}]}}