{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,13]],"date-time":"2025-10-13T00:54:04Z","timestamp":1760316844299,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031882166"},{"type":"electronic","value":"9783031882173"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88217-3_15","type":"book-chapter","created":{"date-parts":[[2025,5,26]],"date-time":"2025-05-26T10:23:46Z","timestamp":1748255026000},"page":"211-225","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Vision-Language Multimodal Fusion in\u00a0Dermatological Disease Classification"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8838-064X","authenticated-orcid":false,"given":"Moreno","family":"La Quatra","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9631-5802","authenticated-orcid":false,"given":"Nicole Dalia","family":"Cilia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8718-111X","authenticated-orcid":false,"given":"Vincenzo","family":"Conti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1976-031X","authenticated-orcid":false,"given":"Salvatore","family":"Sorce","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5759-3332","authenticated-orcid":false,"given":"Giovanni","family":"Garraffa","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1048-7380","authenticated-orcid":false,"given":"Valerio Mario","family":"Salerno","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,27]]},"reference":[{"key":"15_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"15_CR2","unstructured":"Arevalo, J., Solorio, T., Montes-y G\u00f3mez, M., Gonz\u00e1lez, F.A.: Gated multimodal units for information fusion. arXiv preprint arXiv:1702.01992 (2017)"},{"key":"15_CR3","doi-asserted-by":"publisher","unstructured":"Benedetto, I., La\u00a0Quatra, M., Cagliero, L., Vassio, L., Trevisan, M.: Transformer-based prediction of emotional reactions to online social network posts. In: Proceedings of the 13th Workshop on Computational Approaches to Subjectivity, Sentiment, & Social Media Analysis, pp. 354\u2013364 (2023). https:\/\/doi.org\/10.18653\/v1\/2023.wassa-1.31","DOI":"10.18653\/v1\/2023.wassa-1.31"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"15_CR5","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota, June 2019. https:\/\/doi.org\/10.18653\/v1\/N19-1423. https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"key":"15_CR6","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"15_CR7","doi-asserted-by":"crossref","DOI":"10.1016\/j.nahs.2023.101360","volume":"49","author":"F D\u2019Ippolito","year":"2023","unstructured":"D\u2019Ippolito, F., Garraffa, G., Sferlazza, A., Zaccarian, L.: A hybrid observer for localization from noisy inertial data and sporadic position measurements. Nonlinear Anal. Hybrid Syst 49, 101360 (2023)","journal-title":"Nonlinear Anal. Hybrid Syst"},{"key":"15_CR8","doi-asserted-by":"crossref","unstructured":"Esteva, A., et al.: Dermatologist-level classification of skin cancer with deep neural networks. Nature 542(7639), 115\u2013118 (2017)","DOI":"10.1038\/nature21056"},{"issue":"1","key":"15_CR9","doi-asserted-by":"publisher","first-page":"999","DOI":"10.1109\/TIE.2021.3050354","volume":"69","author":"G Garraffa","year":"2022","unstructured":"Garraffa, G., Sferlazza, A., D\u2019Ippolito, F., Alonge, F.: Localization based on parallel robots kinematics as an alternative to trilateration. IEEE Trans. Industr. Electron. 69(1), 999\u20131010 (2022). https:\/\/doi.org\/10.1109\/TIE.2021.3050354","journal-title":"IEEE Trans. Industr. Electron."},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Gouda, N., Amudha, J.: Skin cancer classification using ResNet. In: 2020 IEEE 5th International Conference on Computing Communication and Automation (ICCCA), pp. 536\u2013541. IEEE (2020)","DOI":"10.1109\/ICCCA49541.2020.9250855"},{"key":"15_CR11","doi-asserted-by":"publisher","unstructured":"Gururangan, S., et al.: Don\u2019t stop pretraining: adapt language models to domains and tasks. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 8342\u20138360. Association for Computational Linguistics, Online, July 2020. https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.740. https:\/\/aclanthology.org\/2020.acl-main.740","DOI":"10.18653\/v1\/2020.acl-main.740"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"15_CR13","volume":"8","author":"J Ji","year":"2024","unstructured":"Ji, J., Hou, Y., Chen, X., Pan, Y., Xiang, Y.: Vision-language model for generating textual descriptions from clinical images: model development and validation study. JMIR Formative Res. 8, e32690 (2024)","journal-title":"JMIR Formative Res."},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Koudounas, A., et al.: Towards comprehensive subgroup performance analysis in speech models. IEEE\/ACM Trans. Audio Speech Lang. Process. (2024)","DOI":"10.1109\/TASLP.2024.3363447"},{"key":"15_CR15","doi-asserted-by":"publisher","unstructured":"La\u00a0Quatra, M., Cagliero, L.: Transformer-based highlights extraction from scientific papers. Knowl.-Based Syst. 252 (2022). https:\/\/doi.org\/10.1016\/j.knosys.2022.109382","DOI":"10.1016\/j.knosys.2022.109382"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Li, C., et al.: LlaVA-med: training a large language-and-vision assistant for biomedicine in one day. In: Advances in Neural Information Processing Systems, vol. 36 (2024)","DOI":"10.32388\/VLXB6M"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: A review of deep learning-based information fusion techniques for multimodal medical image classification. Comput. Biol. Med., 108635 (2024)","DOI":"10.1016\/j.compbiomed.2024.108635"},{"key":"15_CR18","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"issue":"6","key":"15_CR19","doi-asserted-by":"crossref","first-page":"e271","DOI":"10.1016\/S2589-7500(19)30123-2","volume":"1","author":"X Liu","year":"2019","unstructured":"Liu, X., et al.: A comparison of deep learning performance against health-care professionals in detecting diseases from medical imaging: a systematic review and meta-analysis. Lancet Digit. Health 1(6), e271\u2013e297 (2019)","journal-title":"Lancet Digit. Health"},{"key":"15_CR20","unstructured":"Liu, Y.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Perez, E., Strub, F., De\u00a0Vries, H., Dumoulin, V., Courville, A.: Film: visual reasoning with a general conditioning layer. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"15_CR22","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357. PMLR (2021)"},{"key":"15_CR23","unstructured":"Touvron, H., et\u00a0al.: LlaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Vaiani, L., La\u00a0Quatra, M., Cagliero, L., Garza, P.: Leveraging multimodal content for podcast summarization. In: Proceedings of the 37th ACM\/SIGAPP Symposium on Applied Computing, pp. 863\u2013870 (2022)","DOI":"10.1145\/3477314.3507106"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Vaiani, L., La\u00a0Quatra, M., Cagliero, L., Garza, P.: ViPER: video-based perceiver for emotion recognition. In: Proceedings of the 3rd International on Multimodal Sentiment Analysis Workshop and Challenge, pp. 67\u201373 (2022)","DOI":"10.1145\/3551876.3554806"},{"key":"15_CR26","doi-asserted-by":"crossref","unstructured":"Zhang, J., Huang, J., Jin, S., Lu, S.: Vision-language models for vision tasks: a survey. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3369699"},{"issue":"1","key":"15_CR27","doi-asserted-by":"crossref","first-page":"5649","DOI":"10.1038\/s41467-024-50043-3","volume":"15","author":"J Zhou","year":"2024","unstructured":"Zhou, J., et al.: Pre-trained multimodal large language model enhances dermatological diagnosis using SkinGPT-4. Nat. Commun. 15(1), 5649 (2024)","journal-title":"Nat. Commun."}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition. ICPR 2024 International Workshops and Challenges"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88217-3_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T19:13:59Z","timestamp":1760296439000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88217-3_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031882166","9783031882173"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88217-3_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"27 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}