{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,14]],"date-time":"2026-06-14T07:09:20Z","timestamp":1781420960336,"version":"3.54.1"},"reference-count":55,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"DOI":"10.13039\/100006190","name":"Telekom Malaysia Research and Development","doi-asserted-by":"publisher","award":["RDTC\/231075"],"award-info":[{"award-number":["RDTC\/231075"]}],"id":[{"id":"10.13039\/100006190","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006190","name":"Telekom Malaysia Research and Development","doi-asserted-by":"publisher","award":["RDTC\/231084"],"award-info":[{"award-number":["RDTC\/231084"]}],"id":[{"id":"10.13039\/100006190","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/access.2024.3403101","type":"journal-article","created":{"date-parts":[[2024,5,20]],"date-time":"2024-05-20T17:28:15Z","timestamp":1716226095000},"page":"71505-71519","source":"Crossref","is-referenced-by-count":5,"title":["UniRaG: Unification, Retrieval, and Generation for Multimodal Question Answering With Pre-Trained Language Models"],"prefix":"10.1109","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-5119-5754","authenticated-orcid":false,"given":"Qi","family":"Zhi Lim","sequence":"first","affiliation":[{"name":"Faculty of Information Science and Technology, Multimedia University, Melaka, Malaysia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3679-8977","authenticated-orcid":false,"given":"Chin","family":"Poo Lee","sequence":"additional","affiliation":[{"name":"Faculty of Information Science and Technology, Multimedia University, Melaka, Malaysia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1929-7978","authenticated-orcid":false,"given":"Kian","family":"Ming Lim","sequence":"additional","affiliation":[{"name":"Faculty of Information Science and Technology, Multimedia University, Melaka, Malaysia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ahmad","family":"Kamsani Samingan","sequence":"additional","affiliation":[{"name":"Telekom Research and Development Sdn. Bhd., Cyberjaya, Selangor, Malaysia"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6294"},{"key":"ref2","article-title":"MultiModalQA: Complex question answering over text, tables and images","author":"Talmor","year":"2021","journal-title":"arXiv:2104.06039"},{"key":"ref3","first-page":"1533","article-title":"Semantic parsing on freebase from question-answer pairs","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Berant"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1237"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P15-1142"},{"key":"ref7","article-title":"Tableqa: Question answering on tabular data","volume":"abs\/1705.06504","author":"Vakulenko","year":"2017","journal-title":"CoRR"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00446"},{"key":"ref9","first-page":"1","article-title":"A multi-world approach to question answering about real-world scenes based on uncertain input","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"27","author":"Malinowski"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.290"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.375"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611964"},{"key":"ref15","article-title":"Progressive evidence refinement for open-domain multimodal retrieval question answering","author":"Yang","year":"2023","journal-title":"arXiv:2310.09696"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.292"},{"key":"ref17","article-title":"MMHQA-ICL: Multimodal in-context learning for hybrid question answering over text, tables and images","author":"Liu","year":"2023","journal-title":"arXiv:2309.04790"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.626"},{"key":"ref19","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv:1810.04805"},{"issue":"140","key":"ref20","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref21","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown"},{"key":"ref22","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2023","journal-title":"arXiv:2310.03744"},{"key":"ref23","article-title":"Table-GPT: Table-tuned GPT for diverse table tasks","author":"Li","year":"2023","journal-title":"arXiv:2310.09263"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref25","article-title":"MS MARCO: A human generated machine reading comprehension dataset","author":"Bajaj","year":"2016","journal-title":"arXiv:1611.09268"},{"key":"ref26","first-page":"5776","article-title":"MiniLM: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Wang"},{"key":"ref27","article-title":"Scaling instruction-finetuned language models","author":"Chung","year":"2022","journal-title":"arXiv:2210.11416"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018876"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.91"},{"key":"ref31","article-title":"Open question answering over tables and text","author":"Chen","year":"2020","journal-title":"arXiv:2010.10439"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.254"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01600"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21370"},{"key":"ref35","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref36","first-page":"23318","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref38","first-page":"26183","article-title":"You only look at one sequence: Rethinking transformer in vision through object detection","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Fang"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.115"},{"issue":"1","key":"ref40","first-page":"29","article-title":"Using TF-IDF to determine word relevance in document queries","volume-title":"Proc. 1st Instructional Conf. Mach. Learn.","volume":"242","author":"Ramos"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1561\/1500000019"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"ref43","article-title":"Pre-training tasks for embedding-based large-scale retrieval","author":"Chang","year":"2020","journal-title":"arXiv:2002.03932"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401075"},{"key":"ref45","article-title":"Passage re-ranking with BERT","author":"Nogueira","year":"2019","journal-title":"arXiv:1901.04085"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331317"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.63"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref49","article-title":"DeBERTaV3: Improving DeBERTa using ELECTRA-style pre-training with gradient-disentangled embedding sharing","author":"He","year":"2021","journal-title":"arXiv:2111.09543"},{"key":"ref50","first-page":"1","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Liu"},{"key":"ref51","first-page":"1","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zheng"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref53","article-title":"Binding language models in symbolic languages","author":"Cheng","year":"2022","journal-title":"arXiv:2210.02875"},{"key":"ref54","article-title":"Multimodal multi-hop question answering through a conversation between tools and efficiently finetuned large language models","author":"Rajabzadeh","year":"2023","journal-title":"arXiv:2309.08922"},{"key":"ref55","article-title":"Turning tables: Generating examples from semi-structured tables for endowing language models with reasoning skills","author":"Yoran","year":"2021","journal-title":"arXiv:2107.07261"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/10380310\/10535103.pdf?arnumber=10535103","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,24]],"date-time":"2024-05-24T17:29:23Z","timestamp":1716571763000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10535103\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":55,"URL":"https:\/\/doi.org\/10.1109\/access.2024.3403101","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}