{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T13:53:01Z","timestamp":1762609981199,"version":"3.40.3"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031723896"},{"type":"electronic","value":"9783031723902"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72390-2_54","type":"book-chapter","created":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T10:03:14Z","timestamp":1729591394000},"page":"575-586","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["HiA: Towards Chinese Multimodal LLMs for\u00a0Comparative High-Resolution Joint Diagnosis"],"prefix":"10.1007","author":[{"given":"Xinpeng","family":"Ding","sequence":"first","affiliation":[]},{"given":"Yongqiang","family":"Chu","sequence":"additional","affiliation":[]},{"given":"Renjie","family":"Pi","sequence":"additional","affiliation":[]},{"given":"Hualiang","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Xiaomeng","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,23]]},"reference":[{"key":"54_CR1","doi-asserted-by":"crossref","unstructured":"Baumgartner, M., J\u00e4ger, P.F., Isensee, F., Maier-Hein, K.H.: 
nndetection: a self-configuring method for medical object detection. In: Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2021: 24th International Conference, Strasbourg, France, September 27\u2013October 1, 2021, Proceedings, Part V 24. pp. 530\u2013539. Springer (2021)","DOI":"10.1007\/978-3-030-87240-3_51"},{"key":"54_CR2","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: Unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"54_CR3","unstructured":"Chen, Q., Hu, X., Wang, Z., Hong, Y.: Medblip: Bootstrapping language-image pre-training from 3d medical images and texts. arXiv preprint arXiv:2305.10799 (2023)"},{"key":"54_CR4","unstructured":"Chen, Z., Duan, Y., Wang, W., He, J., Lu, T., Dai, J., Qiao, Y.: Vision transformer adapter for dense predictions (May 2022)"},{"key":"54_CR5","unstructured":"Chiang, W.L., Li, Z., Lin, Z., Sheng, Y., Wu, Z., Zhang, H., Zheng, L., Zhuang, S., Zhuang, Y., Gonzalez, J.E., et\u00a0al.: Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna lmsys. org (accessed 14 April 2023) (2023)"},{"key":"54_CR6","unstructured":"Dai, W., Li, J., Li, D., Huat, A., Zhao, J., Wang, W., Li, B., Fung, P., Hoi, S.: Instructblip: Towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"54_CR7","doi-asserted-by":"crossref","unstructured":"Ding, X., Han, J., Xu, H., Liang, X., Zhang, W., Li, X.: Holistic autonomous driving understanding by bird\u2019s-eye-view injected multi-modal large models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 13668\u201313677 (2024)","DOI":"10.1109\/CVPR52733.2024.01297"},{"key":"54_CR8","unstructured":"Ding, X., Han, J., Xu, H., Zhang, W., Li, X.: Hilm-d: Towards high-resolution understanding in multimodal large language models for autonomous driving. 
arXiv preprint arXiv:2309.05186 (2023)"},{"key":"54_CR9","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et\u00a0al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"54_CR10","unstructured":"Driess, D., Xia, F., Sajjadi, M.S., Lynch, C., Chowdhery, A., Ichter, B., Wahid, A., Tompson, J., Vuong, Q., Yu, T., et\u00a0al.: Palm-e: An embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)"},{"key":"54_CR11","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (Jun 2016). https:\/\/doi.org\/10.1109\/cvpr.2016.90, http:\/\/dx.doi.org\/10.1109\/cvpr.2016.90","DOI":"10.1109\/cvpr.2016.90"},{"key":"54_CR12","unstructured":"Li, C., Wong, C., Zhang, S., Usuyama, N., Liu, H., Yang, J., Naumann, T., Poon, H., Gao, J.: Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems 36 (2024)"},{"key":"54_CR13","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"54_CR14","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning. pp. 12888\u201312900. PMLR (2022)"},{"key":"54_CR15","unstructured":"Li, K., He, Y., Wang, Y., Li, Y., Wang, W., Luo, P., Wang, Y., Wang, L., Qiao, Y.: Videochat: Chat-centric video understanding. 
arXiv preprint arXiv:2305.06355 (2023)"},{"key":"54_CR16","doi-asserted-by":"crossref","unstructured":"Liu, F., Wu, X., Ge, S., Fan, W., Zou, Y.: Exploring and distilling posterior and prior knowledge for radiology report generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 13753\u201313762 (2021)","DOI":"10.1109\/CVPR46437.2021.01354"},{"key":"54_CR17","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"54_CR18","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"54_CR19","unstructured":"Liu, J., Wang, Z., Ye, Q., Chong, D., Zhou, P., Hua, Y.: Qilin-med-vl: Towards chinese large vision-language model for general healthcare. arXiv preprint arXiv:2310.17956 (2023)"},{"key":"54_CR20","unstructured":"Moor, M., Huang, Q., Wu, S., Yasunaga, M., Dalmia, Y., Leskovec, J., Zakka, C., Reis, E.P., Rajpurkar, P.: Med-flamingo: a multimodal medical few-shot learner. In: Machine Learning for Health (ML4H). pp. 353\u2013367. PMLR (2023)"},{"key":"54_CR21","unstructured":"OpenAI, O.: Gpt-4 technical report (Mar 2023)"},{"key":"54_CR22","unstructured":"Peng, Z., Wang, W., Dong, L., Hao, Y., Huang, S., Ma, S., Wei, F.: Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"54_CR23","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"54_CR24","doi-asserted-by":"publisher","unstructured":"Roberts, R.J.: Pubmed central: The genbank of the published literature. 
Proceedings of the National Academy of Sciences p. 381\u2013382 (Jan 2001). https:\/\/doi.org\/10.1073\/pnas.98.2.381, http:\/\/dx.doi.org\/10.1073\/pnas.98.2.381","DOI":"10.1073\/pnas.98.2.381"},{"key":"54_CR25","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al.: Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"54_CR26","doi-asserted-by":"crossref","unstructured":"Tu, T., Azizi, S., Driess, D., Schaekermann, M., Amin, M., Chang, P.C., Carroll, A., Lau, C., Tanno, R., Ktena, I., et\u00a0al.: Towards generalist biomedical ai. NEJM AI 1(3), AIoa2300138 (2024)","DOI":"10.1056\/AIoa2300138"},{"key":"54_CR27","unstructured":"Wang, H., Liu, C., Xi, N., Qiang, Z., Zhao, S., Qin, B., Liu, T.: Huatuo: Tuning llama model with chinese medical knowledge. arXiv preprint arXiv:2304.06975 (2023)"},{"issue":"3","key":"54_CR28","doi-asserted-by":"publisher","DOI":"10.1016\/j.metrad.2023.100033","volume":"1","author":"Z Wang","year":"2023","unstructured":"Wang, Z., Liu, L., Wang, L., Zhou, L.: R2gengpt: Radiology report generation with frozen llms. Meta-Radiology 1(3), 100033 (2023)","journal-title":"Meta-Radiology"},{"key":"54_CR29","unstructured":"Wu, C., Zhang, X., Zhang, Y., Wang, Y., Xie, W.: Towards generalist foundation model for radiology. arXiv preprint arXiv:2308.02463 (2023)"},{"key":"54_CR30","doi-asserted-by":"crossref","unstructured":"Xue, Y., Xu, T., Rodney\u00a0Long, L., Xue, Z., Antani, S., Thoma, G.R., Huang, X.: Multimodal recurrent model with attention for automated radiology report generation. In: Medical Image Computing and Computer Assisted Intervention\u2013MICCAI 2018: 21st International Conference, Granada, Spain, September 16-20, 2018, Proceedings, Part I. pp. 457\u2013466. 
Springer (2018)","DOI":"10.1007\/978-3-030-00928-1_52"},{"key":"54_CR31","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L., et\u00a0al.: Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"54_CR32","unstructured":"Zhang, X., Wu, C., Zhao, Z., Lin, W., Zhang, Y., Wang, Y., Xie, W.: Pmc-vqa: Visual instruction tuning for medical visual question answering. arXiv preprint arXiv:2305.10415 (2023)"},{"key":"54_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Wang, X., Xu, Z., Yu, Q., Yuille, A., Xu, D.: When radiology report generation meets knowledge graph. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a034, pp. 12910\u201312917 (2020)","DOI":"10.1609\/aaai.v34i07.6989"},{"key":"54_CR34","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: Enhancing vision-language understanding with advanced large language models. 
arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72390-2_54","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T10:13:17Z","timestamp":1729591997000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72390-2_54"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031723896","9783031723902"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72390-2_54","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"23 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Marrakesh","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Morocco","order":4,"name":"conference_country","label":"Conference 
Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2024\/en\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}