{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T20:47:32Z","timestamp":1758055652557,"version":"3.44.0"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T00:00:00Z","timestamp":1748476800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T00:00:00Z","timestamp":1748476800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Scientific Research Program of the Science and Technology Department of Shaanxi Province, China","award":["2023-YBGY-211"],"award-info":[{"award-number":["2023-YBGY-211"]}]},{"name":"Youth Innovation Team Project of the Science Research Program Project of Shaanxi Provincial Department of Education","award":["23JP164"],"award-info":[{"award-number":["23JP164"]}]},{"name":"Technology Innovation Guidance Special Fund of Shaanxi Province","award":["2024QY-SZX-17"],"award-info":[{"award-number":["2024QY-SZX-17"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00530-025-01848-9","type":"journal-article","created":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T01:06:34Z","timestamp":1748480794000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A medical visual question-answering model based on multi-scale feature fusion and question Feature enhancement"],"prefix":"10.1007","volume":"31","author":[{"given":"Hong","family":"Xia","sequence":"first","affiliation":[]},{"given":"Yifan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Hui","family":"Jia","sequence":"additional","affiliation":[]},{"given":"Yanping","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Shiyong","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,29]]},"reference":[{"key":"1848_CR1","doi-asserted-by":"crossref","unstructured":"Chen, Z., Li, G., Wan, X.: Align, reason and learn: Enhancing medical vision-and-language pre-training with knowledge. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 5152\u2013 5161 ( 2022)","DOI":"10.1145\/3503161.3547948"},{"key":"1848_CR2","doi-asserted-by":"crossref","unstructured":"Gong, H., Chen, G., Liu, S., Yu, Y., Li, G.: Cross-modal self-attention with multi-task pre-training for medical visual question answering. In: Proceedings of the 2021 International Conference on Multimedia Retrieval, pp. 456\u2013460 ( 2021)","DOI":"10.1145\/3460426.3463584"},{"key":"1848_CR3","doi-asserted-by":"publisher","first-page":"50626","DOI":"10.1109\/ACCESS.2020.2980024","volume":"8","author":"F Ren","year":"2020","unstructured":"Ren, F., Zhou, Y.: Cgmvqa: a new classification and generative model for medical visual question answering. IEEE Access 8, 50626\u201350636 (2020)","journal-title":"IEEE Access"},{"key":"1848_CR4","doi-asserted-by":"crossref","unstructured":"Khare, Y., Bagal, V., Mathew, M., Devi, A., Priyakumar, U.D., Jawahar, C.: Mmbert: multimodal bert pretraining for improved medical vqa. In: 2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI), pp. 1033\u2013 1036 (2021). IEEE","DOI":"10.1109\/ISBI48211.2021.9434063"},{"key":"1848_CR5","doi-asserted-by":"crossref","unstructured":"Tiong, A.M.H., Li, J., Li, B., Savarese, S., Hoi, S.C.: Plug-and-play vqa: zero-shot vqa by conjoining large pretrained models with zero training. In: Findings of the Association for Computational Linguistics: EMNLP 2022, pp. 951\u2013967 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.67"},{"key":"1848_CR6","doi-asserted-by":"crossref","unstructured":"Banerjee, P., Gokhale, T., Yang, Y., Baral, C.: Weaqa: weak supervision via captions for visual question answering. In: Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, pp. 3420\u20133435 (2021)","DOI":"10.18653\/v1\/2021.findings-acl.302"},{"key":"1848_CR7","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Kukliansy, D., Szpektor, I., Chen, X., Ding, N., Soricut, R.: All you may need for vqa are image captions. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 1947\u20131963 (2022)","DOI":"10.18653\/v1\/2022.naacl-main.142"},{"key":"1848_CR8","unstructured":"Kim, J.-H., Jun, J., Zhang, B.-T.: Bilinear attention networks. In: Proceedings of the 32nd International Conference on Neural Information Processing Systems, pp. 1571\u20131581 (2018)"},{"key":"1848_CR9","doi-asserted-by":"crossref","unstructured":"Nguyen, B.D., Do, T.-T., Nguyen, B.X., Do, T., Tjiputra, E., Tran, Q.D.: Overcoming data limitation in medical visual question answering. In: Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2019: 22nd International Conference, Shenzhen, China, October 13\u201317, 2019, Proceedings, Part IV 22, pp. 522\u2013530 (2019). Springer","DOI":"10.1007\/978-3-030-32251-9_57"},{"key":"1848_CR10","unstructured":"Finn, C., Abbeel, P., Levine, S.: Model-agnostic meta-learning for fast adaptation of deep networks. In: International Conference on Machine Learning, pp. 1126\u20131135 (2017). PMLR"},{"key":"1848_CR11","doi-asserted-by":"crossref","unstructured":"Masci, J., Meier, U., Cire\u015fan, D., Schmidhuber, J.: Stacked convolutional auto-encoders for hierarchical feature extraction. In: Artificial Neural Networks and Machine Learning\u2013ICANN 2011: 21st International Conference on Artificial Neural Networks, Espoo, Finland, June 14\u201317, 2011, Proceedings, Part I 21, pp. 52\u201359 (2011). Springer","DOI":"10.1007\/978-3-642-21735-7_7"},{"key":"1848_CR12","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"key":"1848_CR13","doi-asserted-by":"crossref","unstructured":"Pelka, O., Koitka, S., R\u00fcckert, J., Nensa, F., Friedrich, C.M.: Radiology objects in context (roco): a multimodal image dataset. In: Intravascular Imaging and Computer Assisted Stenting and Large-Scale Annotation of Biomedical Data and Expert Label Synthesis: 7th Joint International Workshop, CVII-STENT 2018 and Third International Workshop, LABELS 2018, Held in Conjunction with MICCAI 2018, Granada, Spain, September 16, 2018, Proceedings 3, pp. 180\u2013189 (2018). Springer","DOI":"10.1007\/978-3-030-01364-6_20"},{"key":"1848_CR14","doi-asserted-by":"crossref","unstructured":"Eslami, S., Meinel, C., De\u00a0Melo, G.: Pubmedclip: how much does clip benefit visual question answering in the medical domain? In: Findings of the Association for Computational Linguistics: EACL 2023, pp. 1181\u20131193 (2023)","DOI":"10.18653\/v1\/2023.findings-eacl.88"},{"key":"1848_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1848_CR16","unstructured":"Dosovitskiy, A.: An image is worth 16 $$\\times$$ 16 words: transformers for image recognition at scale (2020). arXiv:2010.11929"},{"key":"1848_CR17","doi-asserted-by":"crossref","unstructured":"Liu, B., Zhan, L.-M., Xu, L., Ma, L., Yang, Y., Wu, X.-M.: Slake: a semantically-labeled knowledge-enhanced dataset for medical visual question answering. In: 2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI), pp. 1650\u20131654 (2021). IEEE","DOI":"10.1109\/ISBI48211.2021.9434010"},{"issue":"1","key":"1848_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1038\/sdata.2018.251","volume":"5","author":"JJ Lau","year":"2018","unstructured":"Lau, J.J., Gayen, S., Ben Abacha, A., Demner-Fushman, D.: A dataset of clinically generated visual questions and answers about radiology images. Sci. Data 5(1), 1\u201310 (2018)","journal-title":"Sci. Data"},{"key":"1848_CR19","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"1848_CR20","unstructured":"Hasan, S.A., Ling, Y., Farri, O., Liu, J., M\u00fcller, H., Lungren, M.: Overview of imageclef 2018 medical domain visual question answering task. In: Proceedings of CLEF 2018 Working Notes (2018)"},{"issue":"8","key":"1848_CR21","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"J Schmidhuber","year":"1997","unstructured":"Schmidhuber, J., Hochreiter, S., et al.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"1848_CR22","doi-asserted-by":"crossref","unstructured":"Cho, K., Van\u00a0Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using rnn encoder-decoder for statistical machine translation (2014). arXiv:1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"issue":"11","key":"1848_CR23","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Signal Process."},{"key":"1848_CR24","unstructured":"Kenton, J.D.M.-W.C., Toutanova, L.K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"1848_CR25","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition (2014). arXiv:1409.1556"},{"issue":"3","key":"1848_CR26","doi-asserted-by":"publisher","first-page":"364","DOI":"10.3390\/diagnostics15030364","volume":"15","author":"S Muksimova","year":"2025","unstructured":"Muksimova, S., Umirzakova, S., Baltayev, J., Cho, Y.-I.: Rl-cervix.net: a hybrid lightweight model integrating reinforcement learning for cervical cell classification. Diagnostics 15(3), 364 (2025)","journal-title":"Diagnostics"},{"key":"1848_CR27","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Van Der\u00a0Maaten, L., Weinberger, K.Q.: Densely connected convolutional networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4700\u20134708 (2017)","DOI":"10.1109\/CVPR.2017.243"},{"key":"1848_CR28","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"1848_CR29","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, pp. 457\u2013468 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"issue":"12","key":"1848_CR30","doi-asserted-by":"publisher","first-page":"5947","DOI":"10.1109\/TNNLS.2018.2817340","volume":"29","author":"Z Yu","year":"2018","unstructured":"Yu, Z., Yu, J., Xiang, C., Fan, J., Tao, D.: Beyond bilinear: generalized multimodal factorized high-order pooling for visual question answering. IEEE Trans. Neural Netw. Learn. Syst. 29(12), 5947\u20135959 (2018)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"1848_CR31","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Fan, J., Tao, D.: Multi-modal factorized bilinear pooling with co-attention learning for visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1821\u20131830 (2017)","DOI":"10.1109\/ICCV.2017.202"},{"key":"1848_CR32","doi-asserted-by":"crossref","unstructured":"Zhan, L.-M., Liu, B., Fan, L., Chen, J., Wu, X.-M.: Medical visual question answering via conditional reasoning. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2345\u20132354 (2020)","DOI":"10.1145\/3394171.3413761"},{"key":"1848_CR33","doi-asserted-by":"crossref","unstructured":"Liu, B., Zhan, L.-M., Wu, X.-M.: Contrastive pre-training and representation distillation for medical visual question answering based on radiology images. In: Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2021: 24th International Conference, Strasbourg, France, September 27\u2013October 1, 2021, Proceedings, Part II 24, pp. 210\u2013220 (2021). Springer","DOI":"10.1007\/978-3-030-87196-3_20"},{"key":"1848_CR34","doi-asserted-by":"crossref","unstructured":"Do, T., Nguyen, B.X., Tjiputra, E., Tran, M., Tran, Q.D., Nguyen, A.: Multiple meta-model quantifying for medical visual question answering. In: Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2021: 24th International Conference, Strasbourg, France, September 27\u2013October 1, 2021, Proceedings, Part V 24, pp. 64\u201374 (2021). Springer","DOI":"10.1007\/978-3-030-87240-3_7"},{"key":"1848_CR35","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"1848_CR36","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2020.113993","volume":"164","author":"D Gupta","year":"2021","unstructured":"Gupta, D., Suman, S., Ekbal, A.: Hierarchical deep multi-modal network for medical visual question answering. Expert Syst. Appl. 164, 113993 (2021)","journal-title":"Expert Syst. Appl."},{"key":"1848_CR37","doi-asserted-by":"crossref","unstructured":"Liu, J., Hu, T., Zhang, Y., Feng, Y., Hao, J., Lv, J., Liu, Z.: Parameter-efficient transfer learning for medical visual question answering. IEEE Trans. Emerg. Top. Comput. Intell. (2023)","DOI":"10.1109\/TETCI.2023.3311333"},{"key":"1848_CR38","doi-asserted-by":"crossref","unstructured":"Peng, Z., Huang, W., Gu, S., Xie, L., Wang, Y., Jiao, J., Ye, Q.: Conformer: local features coupling global representations for visual recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 367\u2013376 (2021)","DOI":"10.1109\/ICCV48922.2021.00042"},{"key":"1848_CR39","doi-asserted-by":"crossref","unstructured":"Guo, J., Han, K., Wu, H., Tang, Y., Chen, X., Wang, Y., Xu, C.: Cmt: convolutional neural networks meet vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12175\u201312185 (2022)","DOI":"10.1109\/CVPR52688.2022.01186"},{"key":"1848_CR40","first-page":"3965","volume":"34","author":"Z Dai","year":"2021","unstructured":"Dai, Z., Liu, H., Le, Q.V., Tan, M.: Coatnet: marrying convolution and attention for all data sizes. Adv. Neural. Inf. Process. Syst. 34, 3965\u20133977 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1848_CR41","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102091","volume":"103","author":"J Yao","year":"2024","unstructured":"Yao, J., Wang, X., Yang, S., Wang, B.: Vitmatte: boosting image matting with pre-trained plain vision transformers. Inf. Fusion 103, 102091 (2024)","journal-title":"Inf. Fusion"},{"issue":"1","key":"1848_CR42","doi-asserted-by":"publisher","first-page":"2400640","DOI":"10.1056\/AIoa2400640","volume":"2","author":"S Zhang","year":"2025","unstructured":"Zhang, S., Xu, Y., Usuyama, N., Xu, H., Bagga, J., Tinn, R., Preston, S., Rao, R., Wei, M., Valluri, N., et al.: A multimodal biomedical foundation model trained from fifteen million image-text pairs. NEJM AI 2(1), 2400640 (2025)","journal-title":"NEJM AI"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01848-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01848-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01848-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:04:35Z","timestamp":1757927075000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01848-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,29]]},"references-count":42,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["1848"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01848-9","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,5,29]]},"assertion":[{"value":"9 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"271"}}