{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T15:32:16Z","timestamp":1773243136953,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T00:00:00Z","timestamp":1773100800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T00:00:00Z","timestamp":1773100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Key Technologies R&D Program of Henan Province","award":["241111210700"],"award-info":[{"award-number":["241111210700"]}]},{"name":"Henan Provincial Science and Technology Key Project","award":["252102210073"],"award-info":[{"award-number":["252102210073"]}]},{"name":"e Key Research Project of Higher Education Institutions of Henan Province","award":["25A520017"],"award-info":[{"award-number":["25A520017"]}]},{"name":"Open Research Projects of National Engineering Research Center of Advanced Network Technologies","award":["ANT2024001"],"award-info":[{"award-number":["ANT2024001"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Pattern Anal Applic"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1007\/s10044-026-01644-9","type":"journal-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T19:54:50Z","timestamp":1773172490000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Medical dynamic feature enhanced multimodal fusion for medical visual question answering"],"prefix":"10.1007","volume":"29","author":[{"given":"Yuan","family":"Qu","sequence":"first","affiliation":[]},{"given":"Qingtao","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Jia","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Meiwen","family":"Li","sequence":"additional","affiliation":[]},{"given":"Huimin","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Meiyi","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,10]]},"reference":[{"key":"1644_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.artmed.2023.102611","volume":"143","author":"Z Lin","year":"2023","unstructured":"Lin Z, Zhang D, Tao Q et al (2023) Medical visual question answering: a survey. Artif Intell Med 143:102611","journal-title":"Artif Intell Med"},{"key":"1644_CR2","unstructured":"Huynh ND, Bouadjenek MR, Aryal S et al (2025) Visual question answering: from early developments to recent advances - a survey. ArXiv preprint arXiv:2501.03939"},{"key":"1644_CR3","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2023.105049","volume":"85","author":"Z Chen","year":"2023","unstructured":"Chen Z, Zou B, Dai Y et al (2023) Medical visual question answering with symmetric interaction attention and cross-modal gating. Biomed Signal Process Control 85:105049","journal-title":"Biomed Signal Process Control"},{"issue":"1","key":"1644_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/sdata.2018.251","volume":"5","author":"JJ Lau","year":"2018","unstructured":"Lau JJ, Gayen S, Ben Abacha A et al (2018) A dataset of clinically generated visual questions and answers about radiology images. Sci Data 5(1):1\u201310","journal-title":"Sci Data"},{"issue":"9","key":"1644_CR5","doi-asserted-by":"publisher","first-page":"2856","DOI":"10.1109\/TMI.2020.2978284","volume":"39","author":"MH Vu","year":"2020","unstructured":"Vu MH, L\u00f6fstedt T, Nyholm T et al (2020) A question-centric model for visual question answering in medical imaging. IEEE Trans Med Imaging 39(9):2856\u20132868","journal-title":"IEEE Trans Med Imaging"},{"key":"1644_CR6","doi-asserted-by":"crossref","unstructured":"Gong H, Chen G, Liu S, et al (2021) Cross-modal self-attention with multi-task pre-training for medical visual question answering. In: Proceedings of the 2021 International Conference on Multimedia Retrieval (ICMR), pp 456\u2013460","DOI":"10.1145\/3460426.3463584"},{"key":"1644_CR7","doi-asserted-by":"crossref","unstructured":"Zhan LM, Liu B, Fan L et al (2020) Medical visual question answering via conditional reasoning. In: Proceedings of the 28th International Conference on Multimedia (ACM), pp 2345\u20132354","DOI":"10.1145\/3394171.3413761"},{"key":"1644_CR8","unstructured":"Li C, Wong C, Zhang S, et al (2023) Llava-med: Training a large language-and-vision assistant for biomedicine in one day. In: Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"1644_CR9","unstructured":"Thai TM, Vo AT, Tieu HK, et al (2023) Uit-saviors at medvqa-gi 2023: Improving multimodal learning with image enhancement for gastrointestinal visual question answering. In: Working Notes of the Conference and Labs of the Evaluation Forum (CLEF), pp 1571\u20131587"},{"key":"1644_CR10","doi-asserted-by":"crossref","unstructured":"Li P, Liu G, He J et al (2023) Masked vision and language pre-training with unimodal and multimodal contrastive losses for medical visual question answering. In: Proceedings of the 26th International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI), pp 374\u2013383","DOI":"10.1007\/978-3-031-43907-0_36"},{"key":"1644_CR11","doi-asserted-by":"crossref","unstructured":"Chen J, Yang D, Jiang Y et al (2024) Miss: a generative pre-training and fine-tuning approach for med-VQA. In: International Conference on Artificial Neural Networks (ICANN), Springer, pp 299\u2013313","DOI":"10.1007\/978-3-031-72353-7_22"},{"key":"1644_CR12","unstructured":"Abacha AB, Gayen S, Lau JJ, et al (2018) Nlm at imageclef 2018 visual question answering in the medical domain. In: Conference and Labs of the Evaluation Forum (CLEF)"},{"key":"1644_CR13","unstructured":"Ben Abacha A, Hasan SA, Datla VV et al (2019) Vqa-med: Overview of the medical visual question answering task at imageclef 2019. In: Conference and Labs of the Evaluation Forum (CLEF)"},{"key":"1644_CR14","unstructured":"Simonyan K, Zisserman A (2015) Very deep convolutional networks for large-scale image recognition. In: Proceedings of the 3rd International Conference on Learning Representations (ICLR)"},{"key":"1644_CR15","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S et al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"1644_CR16","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"1644_CR17","doi-asserted-by":"crossref","unstructured":"Cho K, van Merrienboer B, Gulcehre C et al (2014) Learning phrase representations using rnn encoder-decoder for statistical machine translation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 1724\u20131734","DOI":"10.3115\/v1\/D14-1179"},{"key":"1644_CR18","doi-asserted-by":"crossref","unstructured":"Xia Z, Li H, Lan L (2025) Medformer: Hierarchical medical vision transformer with content-aware dual sparse selection attention. ArXiv preprint arXiv:2507.02488","DOI":"10.1088\/1361-6560\/ae07a1"},{"key":"1644_CR19","doi-asserted-by":"crossref","unstructured":"Pan H, He S, Zhang K et al (2021) Muvam: a multi-view attention-based model for medical visual question answering. ArXiv preprint arXiv:2107.03216","DOI":"10.1016\/j.knosys.2022.109763"},{"key":"1644_CR20","doi-asserted-by":"crossref","unstructured":"Wang Z, Wu Z, Agarwal D et al (2022) Medclip: Contrastive learning from unpaired medical images and text. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp 3876\u20133887","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"key":"1644_CR21","unstructured":"Gu A, Johnson I, Goel K et al (2021) Combining recurrent, convolutional, and continuous-time models with linear state space layers. In: Advances in Neural Information Processing Systems (NeurIPS), pp 572\u2013585"},{"issue":"8","key":"1644_CR22","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1109\/MC.2025.3571322","volume":"58","author":"A Salam","year":"2025","unstructured":"Salam A, Mahmud R, Islam T et al (2025) A comprehensive survey on mamba: architectures, challenges, and opportunities. Computer 58(8):64\u201376","journal-title":"Computer"},{"key":"1644_CR23","unstructured":"Gu A, Goel K, Re C (2022) Efficiently modeling long sequences with structured state spaces. In: The 10th International Conference on Learning Representations (ICLR)"},{"key":"1644_CR24","unstructured":"Huang PK, Ni HY, Ni Y et al (2022) Learnable descriptive convolutional network for face anti-spoofing. In: Proceedings of the 33rd British Machine Vision Conference (BMVA), p 239"},{"key":"1644_CR25","unstructured":"Fu DY, Dao T, Saab KK et al (2023) Hungry hungry hippos: Towards language modeling with state space models. In: The Eleventh International Conference on Learning Representations (ICLR)"},{"key":"1644_CR26","unstructured":"Gu A, Dao T (2023) Mamba: Linear-time sequence modeling with selective state spaces. ArXiv preprint arXiv:2312.00752"},{"key":"1644_CR27","unstructured":"Yue Y, Li Z (2024) Medmamba: Vision mamba for medical image classification. ArXiv preprint arXiv:2403.03849"},{"issue":"1","key":"1644_CR28","doi-asserted-by":"publisher","first-page":"37","DOI":"10.1007\/s44267-024-00072-9","volume":"2","author":"X Xie","year":"2024","unstructured":"Xie X, Cui Y, Tan T et al (2024) Fusionmamba: dynamic feature enhancement for multimodal image fusion with mamba. Vis Intell 2(1):37","journal-title":"Vis Intell"},{"key":"1644_CR29","doi-asserted-by":"crossref","unstructured":"Zhang M, Yu Y, Jin S et al (2024) Vm-unet-v2: Rethinking vision mamba unet for medical image segmentation. In: International Symposium on Bioinformatics Research and Applications (ISBRA), pp 335\u2013346","DOI":"10.1007\/978-981-97-5128-0_27"},{"key":"1644_CR30","unstructured":"Ma J, Li F, Wang B (2024) U-mamba: Enhancing long-range dependency for biomedical image segmentation. ArXiv preprint arXiv:2401.04722"},{"key":"1644_CR31","unstructured":"Li J, Li D, Savarese S et al (2023) Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning (ICML), pp 19730\u201319742"},{"key":"1644_CR32","doi-asserted-by":"crossref","unstructured":"Chen Z, Du Y, Hu J et al (2022) Multi-modal masked autoencoders for medical vision-and-language pre-training. In: Proceedings of the 25th International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI), pp 679\u2013689","DOI":"10.1007\/978-3-031-16443-9_65"},{"issue":"10","key":"1644_CR33","doi-asserted-by":"publisher","first-page":"5413","DOI":"10.1109\/TFUZZ.2024.3402086","volume":"32","author":"Y Liu","year":"2024","unstructured":"Liu Y, Chen B, Wang S et al (2024) Deep fuzzy multiteacher distillation network for medical visual question answering. IEEE Trans Fuzzy Syst 32(10):5413\u20135427","journal-title":"IEEE Trans Fuzzy Syst"},{"key":"1644_CR34","doi-asserted-by":"crossref","unstructured":"He K, Fan H, Wu Y et al (2020) Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 9726\u20139735","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"1644_CR35","doi-asserted-by":"publisher","DOI":"10.1016\/j.jbi.2024.104748","volume":"160","author":"G Liu","year":"2024","unstructured":"Liu G, He J, Li P et al (2024) Cross-modal self-supervised vision-language pre-training with multiple objectives for medical visual question answering. J Biomed Inform 160:104748","journal-title":"J Biomed Inform"},{"issue":"1","key":"1644_CR36","doi-asserted-by":"publisher","first-page":"688","DOI":"10.1038\/s41597-024-03496-6","volume":"11","author":"J R\u00fcckert","year":"2024","unstructured":"R\u00fcckert J, Bloch L, Br\u00fcngel R et al (2024) Rocov2: radiology objects in context version 2, an updated multimodal image dataset. Sci Data 11(1):688","journal-title":"Sci Data"},{"key":"1644_CR37","unstructured":"Li J, Selvaraju R, Gotmare A et al (2021) Align before fuse: Vision and language representation learning with momentum distillation. In: Advances in Neural Information Processing Systems (NeurIPS), pp 9694\u20139705"},{"key":"1644_CR38","doi-asserted-by":"crossref","unstructured":"He X, Cai Z, Wei W et al (2021) Towards visual question answering on pathology images. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL\/IJCNLP), pp 708\u2013718","DOI":"10.18653\/v1\/2021.acl-short.90"},{"key":"1644_CR39","doi-asserted-by":"crossref","unstructured":"Liu B, Zhan LM, Xu L et al (2021) Slake: A semantically-labeled knowledge-enhanced dataset for medical visual question answering. In: Proceedings of the 18th International Symposium on Biomedical Imaging (ISBI), pp 1650\u20131654","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"1644_CR40","unstructured":"Jones KN, Woode DE, Panizzi K et al (2001) Peir digital library: Online resources and authoring system. In: American Medical Informatics Association Annual Symposium (AMIA)"},{"key":"1644_CR41","doi-asserted-by":"crossref","unstructured":"Cubuk ED, Zoph B, Shlens J et al (2020) Randaugment: Practical automated data augmentation with a reduced search space. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 3008\u20133017","DOI":"10.1109\/CVPRW50498.2020.00359"},{"key":"1644_CR42","unstructured":"Loshchilov I, Hutter F (2019) Decoupled weight decay regularization. In: Proceedings of the 7th International Conference on Learning Representations (ICLR)"},{"key":"1644_CR43","doi-asserted-by":"crossref","unstructured":"Eslami S, Meinel C, De Melo G (2023) Pubmedclip: How much does clip benefit visual question answering in the medical domain? In: Findings of the Association for Computational Linguistics (EACL), pp 1151\u20131163","DOI":"10.18653\/v1\/2023.findings-eacl.88"},{"key":"1644_CR44","doi-asserted-by":"crossref","unstructured":"Li P, Liu G, Tan L et al (2023) Self-supervised vision-language pretraining for medical visual question answering. In: Proceedings of the 20th International Symposium on Biomedical Imaging (ISBI), pp 1\u20135","DOI":"10.1109\/ISBI53787.2023.10230743"},{"key":"1644_CR45","doi-asserted-by":"crossref","unstructured":"Chen Q, Bian M, Xu H (2024) Mmql: Multi-question learning for medical visual question answering. In: Proceedings of the 27th International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI), pp 480\u2013489","DOI":"10.1007\/978-3-031-72086-4_45"},{"key":"1644_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2025.103464","volume":"101","author":"C Zhan","year":"2025","unstructured":"Zhan C, Peng P, Wang H et al (2025) Uniclam: contrastive representation learning with adversarial masking for unified and interpretable medical vision question answering. Medical Image Anal 101:103464","journal-title":"Medical Image Anal"},{"key":"1644_CR47","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2024.128730","volume":"613","author":"A Lameesa","year":"2025","unstructured":"Lameesa A, Silpasuwanchai C, Alam MSB (2025) Vg-calf: a vision-guided cross-attention and late-fusion network for radiology images in medical visual question answering. Neurocomputing 613:128730","journal-title":"Neurocomputing"},{"key":"1644_CR48","doi-asserted-by":"crossref","unstructured":"Selvaraju RR, Cogswell M, Das A et al (2017) Grad-cam: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp 618\u2013626","DOI":"10.1109\/ICCV.2017.74"}],"container-title":["Pattern Analysis and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-026-01644-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10044-026-01644-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-026-01644-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T19:54:57Z","timestamp":1773172497000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10044-026-01644-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,10]]},"references-count":48,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,6]]}},"alternative-id":["1644"],"URL":"https:\/\/doi.org\/10.1007\/s10044-026-01644-9","relation":{},"ISSN":["1433-7541","1433-755X"],"issn-type":[{"value":"1433-7541","type":"print"},{"value":"1433-755X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,10]]},"assertion":[{"value":"26 October 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 February 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All authors declared no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"66"}}