{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,31]],"date-time":"2026-03-31T06:13:47Z","timestamp":1774937627606,"version":"3.50.1"},"reference-count":26,"publisher":"Springer Science and Business Media LLC","issue":"33","license":[{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T00:00:00Z","timestamp":1724112000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Comput &amp; Applic"],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1007\/s00521-024-10318-8","type":"journal-article","created":{"date-parts":[[2024,8,20]],"date-time":"2024-08-20T19:47:25Z","timestamp":1724183245000},"page":"20949-20962","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Multimodal fusion: advancing medical visual question-answering"],"prefix":"10.1007","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9146-9030","authenticated-orcid":false,"given":"Anjali","family":"Mudgal","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-9503-3311","authenticated-orcid":false,"given":"Udbhav","family":"Kush","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7441-2222","authenticated-orcid":false,"given":"Aditya","family":"Kumar","sequence":"additional","affiliation":[]},{"given":"Amir","family":"Jafari","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,20]]},"reference":[{"key":"10318_CR1","unstructured":"Sahu T (2022) Visual question answering with multimodal transformers. https:\/\/medium.com\/data-science-at-microsoft\/visual-question-answering-with-multimodal-transformers-d4f57950c867"},{"key":"10318_CR2","unstructured":"Ben Abacha A, Hasan SA, Datla VV, Liu J, Demner-Fushman D, M\u00fcller H (2019) Vqa-med: overview of the medical visual question answering task at imageclef 2019. In: Working Notes of CLEF 2019. CEUR Workshop Proceedings, vol. 2380. CEUR-WS.org, Lugano, Switzerland. https:\/\/ceur-ws.org\/Vol-2380\/paper_272.pdf"},{"key":"10318_CR3","doi-asserted-by":"publisher","unstructured":"Lau JJ, Gayen S, Demner D, Ben\u00a0Abacha A (2019) Visual question answering in radiology (VQA-RAD). OSF. https:\/\/doi.org\/10.17605\/OSF.IO\/89KPS","DOI":"10.17605\/OSF.IO\/89KPS"},{"key":"10318_CR4","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2023) Attention is all you need"},{"key":"10318_CR5","doi-asserted-by":"crossref","unstructured":"Agrawal A, Lu J, Antol S, Mitchell M, Zitnick CL, Batra D, Parikh D (2016) VQA: visual question answering","DOI":"10.1007\/s11263-016-0966-6"},{"key":"10318_CR6","doi-asserted-by":"crossref","unstructured":"Zhang P, Goyal Y, Summers-Stay D, Batra D, Parikh D (2015) Yin and yang: Balancing and answering binary visual questions. 
CoRR arXiv:1511.05099","DOI":"10.1109\/CVPR.2016.542"},{"key":"10318_CR7","doi-asserted-by":"crossref","unstructured":"Zhu Y, Groth O, Bernstein M, Fei-Fei L (2016) Visual7W: Grounded Question Answering in Images","DOI":"10.1109\/CVPR.2016.540"},{"key":"10318_CR8","doi-asserted-by":"crossref","unstructured":"Yu L, Park E, Berg AC, Berg TL (2015) Visual madlibs: fill in the blank image generation and question answering","DOI":"10.1109\/ICCV.2015.283"},{"key":"10318_CR9","unstructured":"Gao H, Mao J, Zhou J, Huang Z, Wang L, Xu W (2015) Are you talking to a machine? Dataset and methods for multilingual image question answering"},{"key":"10318_CR10","doi-asserted-by":"publisher","unstructured":"Andreas J, Rohrbach M, Darrell T, Klein D (2016) Learning to compose neural networks for question answering. In: Knight K, Nenkova A., Rambow O (eds.) Proceedings of the 2016 conference of the North American Chapter of the association for computational linguistics: human language technologies, pp 1545\u20131554. Association for Computational Linguistics, San Diego, California. https:\/\/doi.org\/10.18653\/v1\/N16-1181 . https:\/\/aclanthology.org\/N16-1181","DOI":"10.18653\/v1\/N16-1181"},{"key":"10318_CR11","unstructured":"Chen K, Wang J, Chen L-C, Gao H, Xu W, Nevatia R (2016) ABC-CNN: an attention based convolutional neural network for visual question answering"},{"key":"10318_CR12","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhutdinov R, Zemel R, Bengio Y (2016) Show. Neural image caption generation with visual attention, attend and tell"},{"key":"10318_CR13","doi-asserted-by":"publisher","unstructured":"Pennington J, Socher R, Manning C (2014) GloVe: Global vectors for word representation. In: Moschitti A, Pang, B, Daelemans W (eds) Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp. 1532\u20131543. Association for computational linguistics, Doha, Qatar. https:\/\/doi.org\/10.3115\/v1\/D14-1162 . https:\/\/aclanthology.org\/D14-1162","DOI":"10.3115\/v1\/D14-1162"},{"issue":"8","key":"10318_CR14","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput. 9(8):1735\u20131780. https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput."},{"key":"10318_CR15","unstructured":"Li J, Li D, Xiong C, Hoi S (2022) BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation"},{"key":"10318_CR16","unstructured":"Li J (2022) BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. https:\/\/blog.salesforceairesearch.com\/blip-bootstrapping-language-image-pretraining\/"},{"key":"10318_CR17","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2021) An image is worth 16 x 16 words: transformers for image recognition at scale"},{"key":"10318_CR18","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) BERT: pre-training of deep bidirectional transformers for language understanding"},{"key":"10318_CR19","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li J, Selvaraju RR, Gotmare AD, Joty S, Xiong C, Hoi S (2021) Align before fuse: vision and language representation learning with momentum distillation. 
Adv Neural Inf Process Syst 34:9694\u20139705","journal-title":"Adv Neural Inf Process Syst"},{"key":"10318_CR20","unstructured":"Wang Z, Yu J, Yu AW, Dai Z, Tsvetkov Y, Cao Y (2022) SimVLM: simple visual language model pretraining with weak supervision"},{"key":"10318_CR21","doi-asserted-by":"crossref","unstructured":"Wolf T, Debut L, Sanh V, Chaumond J, Delangue C, Moi A, Cistac P, Rault T, Louf R, Funtowicz M, Davison J, Shleifer S, Platen P, Ma C, Jernite Y, Plu J, Xu C, Scao TL, Gugger S, Drame M, Lhoest Q, Rush AM (2020) HuggingFace\u2019s transformers: state-of-the-art natural language processing. arXiv:1910.03771","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"10318_CR22","unstructured":"Lin T-Y, Maire M, Belongie S, Bourdev L, Girshick R, Hays J, Perona P, Ramanan D, Zitnick CL, Doll\u00e1r P (2015) Microsoft COCO: Common Objects in Context. http:\/\/arxiv.org\/abs\/1405.0312arXiv:1405.0312"},{"key":"10318_CR23","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2016","unstructured":"Krishna R, Zhu Y, Groth O, Johnson J, Hata K, Kravitz J, Chen S, Kalantidis Y, Li L-J, Shamma DA, Bernstein MS, Li F-F (2016) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vision 123:32\u201373","journal-title":"Int J Comput Vision"},{"key":"10318_CR24","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics. ACL \u201902, pp 311\u2013318. Association for Computational Linguistics, USA. https:\/\/doi.org\/10.3115\/1073083.1073135","DOI":"10.3115\/1073083.1073135"},{"key":"10318_CR25","unstructured":"Lin C-Y (2004) ROUGE: A package for automatic evaluation of summaries. In: Text summarization branches out, pp 74\u201381. Association for computational linguistics, Barcelona, Spain. 
https:\/\/aclanthology.org\/W04-1013"},{"key":"10318_CR26","unstructured":"Loshchilov I, Hutter F (2019) Decoupled weight decay regularization"}],"container-title":["Neural Computing and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-10318-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00521-024-10318-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00521-024-10318-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T16:04:56Z","timestamp":1727539496000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00521-024-10318-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,20]]},"references-count":26,"journal-issue":{"issue":"33","published-print":{"date-parts":[[2024,11]]}},"alternative-id":["10318"],"URL":"https:\/\/doi.org\/10.1007\/s00521-024-10318-8","relation":{},"ISSN":["0941-0643","1433-3058"],"issn-type":[{"value":"0941-0643","type":"print"},{"value":"1433-3058","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,20]]},"assertion":[{"value":"7 May 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 July 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 August 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"I hereby certify that to the best of my knowledge, the authors have no relevant financial or non-financial interests to disclose. The authors have no conflict of interest to declare that is relevant to the content of this article. All authors certify that they have no affiliations with or involvement in any organization or entity with any financial interest or non-financial interest in the subject matter or materials discussed in this manuscript. The authors have no financial or proprietary interests in any material discussed in this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
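
The record above is a single Crossref REST API "work" message for DOI 10.1007/s00521-024-10318-8. As a minimal sketch only, the snippet below shows how such a record could be retrieved and a few of its fields read; the endpoint path and field names are those of the public Crossref works API, while the script itself, the placeholder User-Agent contact string, and the choice of printed fields are illustrative assumptions rather than part of the deposited record.

import requests

# Minimal sketch: fetch the work record shown above from the public Crossref REST API
# and read a few of its fields. Assumes network access and the `requests` package.
DOI = "10.1007/s00521-024-10318-8"
resp = requests.get(
    "https://api.crossref.org/works/" + DOI,
    headers={"User-Agent": "example-script (mailto:you@example.org)"},  # placeholder contact address
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # the metadata sits under the top-level "message" key

print(work["title"][0])  # article title
print(", ".join(
    (a.get("given", "") + " " + a["family"]).strip()  # author names as given/family pairs
    for a in work.get("author", [])
))
print("References deposited:", work.get("reference-count"))
print("Journal:", work.get("container-title", [""])[0])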