{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,26]],"date-time":"2025-07-26T09:35:00Z","timestamp":1753522500923,"version":"3.40.3"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031720826"},{"type":"electronic","value":"9783031720833"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72083-3_56","type":"book-chapter","created":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T18:01:42Z","timestamp":1728842502000},"page":"602-612","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Design as\u00a0Desired: Utilizing Visual Question Answering for\u00a0Multimodal Pre-training"],"prefix":"10.1007","author":[{"given":"Tongkun","family":"Su","sequence":"first","affiliation":[]},{"given":"Jun","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xi","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Haibo","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Qiong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Faqin","family":"Lv","sequence":"additional","affiliation":[]},{"given":"Baoliang","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Ying","family":"Hu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,14]]},"reference":[{"key":"56_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2019.104863","volume":"28","author":"W Al-Dhabyani","year":"2020","unstructured":"Al-Dhabyani, W., Gomaa, M., Khaled, H., Fahmy, A.: Dataset of breast ultrasound images. Data in brief 28, 104863 (2020)","journal-title":"Data in brief"},{"key":"56_CR2","doi-asserted-by":"crossref","unstructured":"Bai, L., Islam, M., Ren, H.: Cat-vil: Co-attention gated vision-language embedding for visual question localized-answering in robotic surgery. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 397\u2013407. Springer (2023)","DOI":"10.1007\/978-3-031-43996-4_38"},{"key":"56_CR3","unstructured":"Banerjee, S., Lavie, A.: Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. pp. 65\u201372 (2005)"},{"key":"56_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Z., Song, Y., Chang, T.H., et\u00a0al.: Generating radiology reports via memory-driven transformer. Conference on Empirical Methods in Natural Language Processing (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.112"},{"key":"56_CR5","unstructured":"Devlin, J., Chang, M.W., Lee, K., et\u00a0al.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"56_CR6","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. International Conference on Learning Representations (2020)"},{"key":"56_CR7","unstructured":"Fu, C., Chen, P., Shen, Y., et\u00a0al.: Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"56_CR8","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et\u00a0al.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"56_CR9","doi-asserted-by":"crossref","unstructured":"Huang, S.C., Shen, L., Lungren, M.P., et\u00a0al.: Gloria: A multimodal global-local representation learning framework for label-efficient medical image recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 3942\u20133951 (2021)","DOI":"10.1109\/ICCV48922.2021.00391"},{"key":"56_CR10","doi-asserted-by":"crossref","unstructured":"Huang, X., Gong, H.: A dual-attention learning network with word and sentence embedding for medical visual question answering. IEEE Transactions on Medical Imaging (2023)","DOI":"10.1109\/TMI.2023.3322868"},{"key":"56_CR11","doi-asserted-by":"crossref","unstructured":"Jing, B., Xie, P., Xing, E.: On the automatic generation of medical imaging reports. Annual Meeting of the Association for Computational Linguistics (2017)","DOI":"10.18653\/v1\/P18-1240"},{"key":"56_CR12","doi-asserted-by":"crossref","unstructured":"Li, J., Li, S., Hu, Y., et\u00a0al.: A self-guided framework for radiology report generation. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 588\u2013598. Springer (2022)","DOI":"10.1007\/978-3-031-16452-1_56"},{"key":"56_CR13","unstructured":"Li, J., Li, D., Savarese, S., et\u00a0al.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models (2023)"},{"key":"56_CR14","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., et\u00a0al.: Align before fuse: Vision and language representation learning with momentum distillation. Advances in neural information processing systems 34, 9694\u20139705 (2021)","journal-title":"Advances in neural information processing systems"},{"key":"56_CR15","unstructured":"Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out. pp. 74\u201381 (2004)"},{"key":"56_CR16","doi-asserted-by":"crossref","unstructured":"Liu, B., Zhan, L.M., Wu, X.M.: Contrastive pre-training and representation distillation for medical visual question answering based on radiology images. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 210\u2013220. Springer (2021)","DOI":"10.1007\/978-3-030-87196-3_20"},{"key":"56_CR17","unstructured":"Maroua, A.: Algerian ultrasound images thyroid dataset: Auitd (2022), https:\/\/www.kaggle.com\/azouzmaroua\/datasets, accessed on February 10, 2023"},{"key":"56_CR18","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"56_CR19","unstructured":"OpenAI: Introducing chatgpt (2023), https:\/\/openai.com\/blog\/chatgpt\/, accessed on January 10, 2023"},{"key":"56_CR20","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., et\u00a0al.: Bleu: a method for automatic evaluation of machine translation (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"56_CR21","doi-asserted-by":"crossref","unstructured":"Pedraza, L., Vargas, C., Narv\u00e1ez, F., et\u00a0al.: An open access thyroid ultrasound image database. In: 10th International symposium on medical information processing and analysis. vol.\u00a09287, pp. 188\u2013193. SPIE (2015)","DOI":"10.1117\/12.2073532"},{"key":"56_CR22","doi-asserted-by":"crossref","unstructured":"Qin, H., Song, Y.: Reinforced cross-modal alignment for radiology report generation. In: Findings of the Association for Computational Linguistics: ACL 2022. pp. 448\u2013458 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.38"},{"key":"56_CR23","unstructured":"Radford, A., Kim, J.W., Hallacy, C., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"56_CR24","unstructured":"Radford, A., Narasimhan, K., Salimans, T., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"key":"56_CR25","unstructured":"Radford, A., Wu, J., Child, R., et\u00a0al.: Language models are unsupervised multitask learners (2019)"},{"key":"56_CR26","unstructured":"Ramesh, A., Pavlov, M., Goh, G., et\u00a0al.: Zero-shot text-to-image generation. In: International conference on machine learning. pp. 8821\u20138831. Pmlr (2021)"},{"key":"56_CR27","unstructured":"Redmon, J., Farhadi, A.: Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767 (2018)"},{"key":"56_CR28","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et\u00a0al.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"56_CR29","first-page":"33536","volume":"35","author":"F Wang","year":"2022","unstructured":"Wang, F., Zhou, Y., Wang, S., et\u00a0al.: Multi-granularity cross-modal alignment for generalized medical visual representation learning. Advances in Neural Information Processing Systems 35, 33536\u201333549 (2022)","journal-title":"Advances in Neural Information Processing Systems"},{"key":"56_CR30","unstructured":"Wang, Q., Dai, S., Xu, B., et\u00a0al.: Building chinese biomedical language models via multi-level text discrimination. arXiv preprint arXiv:2110.07244 (2021)"},{"key":"56_CR31","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wu, Z., Agarwal, D., et\u00a0al.: Medclip: Contrastive learning from unpaired medical images and text. Conference on Empirical Methods in Natural Language Processing (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"key":"56_CR32","unstructured":"Wu, X., Yang, S., Qiu, Z., et\u00a0al.: Deltanet: Conditional medical report generation for covid-19 diagnosis. International Conference On Computational Linguistics (2022)"},{"key":"56_CR33","doi-asserted-by":"crossref","unstructured":"Xie, Y., Gu, L., Harada, T., et\u00a0al.: Medim: Boost medical image representation via radiology report-guided masking. In: International Conference on Medical Image Computing and Computer-Assisted Intervention. pp. 13\u201323. Springer (2023)","DOI":"10.1007\/978-3-031-43907-0_2"},{"key":"56_CR34","unstructured":"Yang, Y., Yu, J., Zhang, J., et\u00a0al.: Joint embedding of deep visual and semantic features for medical image report generation. IEEE Transactions on Multimedia (2021)"},{"key":"56_CR35","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Chen, H., Zhang, J., et\u00a0al.: Uer: An open-source toolkit for pre-training models. Conference on Empirical Methods in Natural Language Processing-International Joint Conference on Natural Language Processing p.\u00a0241 (2019)","DOI":"10.18653\/v1\/D19-3041"},{"key":"56_CR36","doi-asserted-by":"crossref","unstructured":"Zhao, Z., Li, Y., Hou, C., et\u00a0al.: Tencentpretrain: A scalable and flexible toolkit for pre-training models of different modalities. Annual Meeting of the Association for Computational Linguistics p.\u00a0217 (2023)","DOI":"10.18653\/v1\/2023.acl-demo.20"},{"key":"56_CR37","unstructured":"Zhou, H.Y., Lian, C., Wang, L., et\u00a0al.: Advancing radiograph representation learning with masked record modeling. International Conference on Learning Representations (2023)"}],"container-title":["Lecture Notes in Computer Science","Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72083-3_56","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T18:13:21Z","timestamp":1728843201000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72083-3_56"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031720826","9783031720833"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72083-3_56","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"14 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Marrakesh","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Morocco","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2024\/en\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}