{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T03:33:51Z","timestamp":1777952031706,"version":"3.51.4"},"reference-count":30,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T00:00:00Z","timestamp":1769817600000},"content-version":"vor","delay-in-days":30,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Procedia Computer Science"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1016\/j.procs.2026.01.055","type":"journal-article","created":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T19:30:19Z","timestamp":1774035019000},"page":"455-463","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Modular Arabic VQA System Using Pre-Trained Models"],"prefix":"10.1016","volume":"275","author":[{"given":"Shahad","family":"Alshalawi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Amal","family":"Almansour","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Amani","family":"Jamal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.procs.2026.01.055_bib1","doi-asserted-by":"crossref","unstructured":"Antol, S., et al. Vqa: Visual question answering. in Proceedings of the IEEE international conference on computer vision. 2015.","DOI":"10.1109\/ICCV.2015.279"},{"key":"10.1016\/j.procs.2026.01.055_bib2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al. Bottom-up and top-down attention for image captioning and visual question answering. in Proceedings of the IEEE conference on computer vision and pattern recognition. 2018.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"10.1016\/j.procs.2026.01.055_bib3","doi-asserted-by":"crossref","unstructured":"Nguyen, D.-K. and T. Okatani. Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering. in Proceedings of the IEEE conference on computer vision and pattern recognition. 2018.","DOI":"10.1109\/CVPR.2018.00637"},{"key":"10.1016\/j.procs.2026.01.055_bib4","unstructured":"Kim, J.-H., J. Jun, and B.-T. Zhang, Bilinear attention networks. Advances in neural information processing systems, 2018. 31."},{"key":"10.1016\/j.procs.2026.01.055_bib5","doi-asserted-by":"crossref","unstructured":"Yu, Z., et al. Deep modular co-attention networks for visual question answering. in Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2019.","DOI":"10.1109\/CVPR.2019.00644"},{"key":"10.1016\/j.procs.2026.01.055_bib6","series-title":"Unifying vision-and-language tasks via text generation. in International Conference on Machine Learning.","author":"Cho","year":"2021"},{"key":"10.1016\/j.procs.2026.01.055_bib7","doi-asserted-by":"crossref","unstructured":"Li, G., et al. Unicoder-vl: A universal encoder for vision and language by cross-modal pre-training. in Proceedings of the AAAI conference on artificial intelligence. 2020.","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"10.1016\/j.procs.2026.01.055_bib8","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"10.1016\/j.procs.2026.01.055_bib9","unstructured":"Li, L.H., et al., Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557, 2019."},{"key":"10.1016\/j.procs.2026.01.055_bib10","unstructured":"Lu, J., et al., Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems, 2019. 32."},{"key":"10.1016\/j.procs.2026.01.055_bib11","doi-asserted-by":"crossref","unstructured":"Thobhani, A., et al., A Survey on Enhancing Image Captioning with Advanced Strategies and Techniques. Computer Modeling in Engineering & Sciences (CMES), 2025. 142(3).","DOI":"10.32604\/cmes.2025.059192"},{"key":"10.1016\/j.procs.2026.01.055_bib12","doi-asserted-by":"crossref","unstructured":"Alghyaline, S., Arabic Optical Character Recognition: A Review. Computer Modeling in Engineering & Sciences (CMES), 2023. 135(3).","DOI":"10.32604\/cmes.2022.024555"},{"issue":"8","key":"10.1016\/j.procs.2026.01.055_bib13","doi-asserted-by":"crossref","first-page":"10803","DOI":"10.1007\/s13369-023-07687-y","article-title":"Vaqa: Visual arabic question answering","volume":"48","author":"Kamel","year":"2023","journal-title":"Arabian Journal for Science and engineering"},{"key":"10.1016\/j.procs.2026.01.055_bib14","series-title":"ArabicQuest: Enhancing Arabic Visual Question Answering with LLM Fine-Tuning. in 2024 Intelligent Methods, Systems, and Applications (IMSA).","author":"ElMaghraby","year":"2024"},{"key":"10.1016\/j.procs.2026.01.055_bib15","doi-asserted-by":"crossref","unstructured":"Ghaboura, S., et al., Camel-bench: A comprehensive arabic lmm benchmark. arXiv preprint arXiv:2410.18976, 2024.","DOI":"10.18653\/v1\/2025.findings-naacl.105"},{"key":"10.1016\/j.procs.2026.01.055_bib16","series-title":"Microsoft coco: Common objects in context. in European conference on computer vision.","author":"Lin","year":"2014"},{"key":"10.1016\/j.procs.2026.01.055_bib17","doi-asserted-by":"crossref","unstructured":"Khashabi, D., et al., Unifiedqa: Crossing format boundaries with a single qa system. arXiv preprint arXiv:2005.00700, 2020.","DOI":"10.18653\/v1\/2020.findings-emnlp.171"},{"key":"10.1016\/j.procs.2026.01.055_bib18","doi-asserted-by":"crossref","unstructured":"Tiong, A.M.H., et al., Plug-and-play vqa: Zero-shot vqa by conjoining large pretrained models with zero training. arXiv preprint arXiv:2210.08773, 2022.","DOI":"10.18653\/v1\/2022.findings-emnlp.67"},{"key":"10.1016\/j.procs.2026.01.055_bib19","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al. An empirical study of gpt-3 for few-shot knowledge-based vqa. in Proceedings of the AAAI conference on artificial intelligence. 2022.","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"10.1016\/j.procs.2026.01.055_bib20","doi-asserted-by":"crossref","unstructured":"Guo, J., et al. From images to textual prompts: Zero-shot visual question answering with frozen large language models. in Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023.","DOI":"10.1109\/CVPR52729.2023.01046"},{"key":"10.1016\/j.procs.2026.01.055_bib21","unstructured":"Chen, J., et al., Plug-and-play grounding of reasoning in multimodal large language models. arXiv preprint arXiv:2403.19322, 2024."},{"key":"10.1016\/j.procs.2026.01.055_bib22","doi-asserted-by":"crossref","unstructured":"Emami, J., Arabic image captioning using pre-training of deep bidirectional transformers. LU-CS-EX, 2022.","DOI":"10.18653\/v1\/2022.inlg-main.4"},{"key":"10.1016\/j.procs.2026.01.055_bib23","doi-asserted-by":"crossref","unstructured":"Mohamed, A., et al., Violet: A vision-language model for Arabic image captioning with gemini decoder. arXiv preprint arXiv:2311.08844, 2023.","DOI":"10.18653\/v1\/2023.arabicnlp-1.1"},{"key":"10.1016\/j.procs.2026.01.055_bib24","unstructured":"Achiam, J., et al., Gpt-4 technical report. arXiv preprint arXiv:2303.08774, 2023."},{"key":"10.1016\/j.procs.2026.01.055_bib25","unstructured":"Team, G., et al., Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805, 2023."},{"key":"10.1016\/j.procs.2026.01.055_bib26","unstructured":"Zhang, T., et al., Bertscore: Evaluating text generation with bert. arXiv preprint arXiv:1904.09675, 2019."},{"key":"10.1016\/j.procs.2026.01.055_bib27","doi-asserted-by":"crossref","unstructured":"Goyal, Y., et al. Making the v in vqa matter: Elevating the role of image understanding in visual question answering. in Proceedings of the IEEE conference on computer vision and pattern recognition. 2017.","DOI":"10.1109\/CVPR.2017.670"},{"key":"10.1016\/j.procs.2026.01.055_bib28","doi-asserted-by":"crossref","unstructured":"Marino, K., et al. Ok-vqa: A visual question answering benchmark requiring external knowledge. in Proceedings of the IEEE\/cvf conference on computer vision and pattern recognition. 2019.","DOI":"10.1109\/CVPR.2019.00331"},{"key":"10.1016\/j.procs.2026.01.055_bib29","doi-asserted-by":"crossref","unstructured":"Ranasinghe, T., C. Orasan, and R. Mitkov, TransQuest: Translation quality estimation with cross-lingual transformers. arXiv preprint arXiv:2011.01536, 2020.","DOI":"10.18653\/v1\/2020.coling-main.445"},{"key":"10.1016\/j.procs.2026.01.055_bib30","doi-asserted-by":"crossref","unstructured":"Papineni, K., et al. Bleu: a method for automatic evaluation of machine translation. in Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 2002.","DOI":"10.3115\/1073083.1073135"}],"container-title":["Procedia Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1877050926000554?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1877050926000554?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T11:18:57Z","timestamp":1777893537000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1877050926000554"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":30,"alternative-id":["S1877050926000554"],"URL":"https:\/\/doi.org\/10.1016\/j.procs.2026.01.055","relation":{},"ISSN":["1877-0509"],"issn-type":[{"value":"1877-0509","type":"print"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Modular Arabic VQA System Using Pre-Trained Models","name":"articletitle","label":"Article Title"},{"value":"Procedia Computer Science","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.procs.2026.01.055","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Author(s). Published by Elsevier B.V.","name":"copyright","label":"Copyright"}]}}