{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,20]],"date-time":"2025-10-20T18:54:16Z","timestamp":1760986456320,"version":"3.44.0"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032051264","type":"print"},{"value":"9783032051271","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T00:00:00Z","timestamp":1758326400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,20]],"date-time":"2025-09-20T00:00:00Z","timestamp":1758326400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-05127-1_23","type":"book-chapter","created":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T21:15:39Z","timestamp":1758316539000},"page":"235-245","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Hallucination-Aware Multimodal Benchmark for\u00a0Gastrointestinal Image Analysis with\u00a0Large Vision-Language Models"],"prefix":"10.1007","author":[{"given":"Bidur","family":"Khanal","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sandesh","family":"Pokhrel","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sanjay","family":"Bhandari","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ramesh","family":"Rana","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nikesh","family":"Shrestha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ram B.","family":"Gurung","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cristian","family":"Linte","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Angus","family":"Watson","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yash R.","family":"Shrestha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Binod","family":"Bhattarai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,9,20]]},"reference":[{"key":"23_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"issue":"1","key":"23_CR2","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1053\/j.gastro.2020.02.068","volume":"159","author":"M Arnold","year":"2020","unstructured":"Arnold, M., et al.: Global burden of 5 major types of gastrointestinal cancer. Gastroenterology 159(1), 335\u2013349 (2020)","journal-title":"Gastroenterology"},{"key":"23_CR3","unstructured":"Bai, Z., Wang, P., Xiao, T., He, T., Han, Z., Zhang, Z., et\u00a0al.: Hallucination of multimodal large language models: a survey. arXiv preprint arXiv:2404.18930 (2024)"},{"key":"23_CR4","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL Workshops, pp. 65\u201372 (2005)"},{"issue":"1","key":"23_CR5","doi-asserted-by":"publisher","first-page":"283","DOI":"10.1038\/s41597-020-00622-y","volume":"7","author":"H Borgli","year":"2020","unstructured":"Borgli, H., et al.: HyperKvasir, a comprehensive multi-class image and video dataset for gastrointestinal endoscopy. Sci. Data 7(1), 283 (2020)","journal-title":"Sci. Data"},{"key":"23_CR6","unstructured":"Chen, J., et al.: Detecting and evaluating medical hallucinations in large vision language models. arXiv preprint arXiv:2406.10185 (2024)"},{"key":"23_CR7","doi-asserted-by":"publisher","DOI":"10.1016\/j.compbiomed.2019.103351","volume":"111","author":"T Cogan","year":"2019","unstructured":"Cogan, T., Cogan, M., Tamil, L.: MAPGI: accurate identification of anatomical landmarks and diseased tissue in gastrointestinal tract using deep learning. Comput. Biol. Med. 111, 103351 (2019)","journal-title":"Comput. Biol. Med."},{"issue":"2","key":"23_CR8","doi-asserted-by":"publisher","first-page":"275","DOI":"10.1007\/s10120-022-01358-x","volume":"26","author":"H Du","year":"2023","unstructured":"Du, H., et al.: A deep-learning based system using multi-modal data for diagnosing gastric neoplasms in real-time (with video). Gastric Cancer 26(2), 275\u2013285 (2023)","journal-title":"Gastric Cancer"},{"issue":"8017","key":"23_CR9","doi-asserted-by":"publisher","first-page":"625","DOI":"10.1038\/s41586-024-07421-0","volume":"630","author":"S Farquhar","year":"2024","unstructured":"Farquhar, S., Kossen, J., Kuhn, L., Gal, Y.: Detecting hallucinations in large language models using semantic entropy. Nature 630(8017), 625\u2013630 (2024)","journal-title":"Nature"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Gautam, S., et al.: Kvasir-VQA: a text-image pair GI tract dataset. In: Proceedings of the VLM4Bio Workshop, pp. 3\u201312. ACM (2024)","DOI":"10.1145\/3689096.3689458"},{"key":"23_CR11","doi-asserted-by":"crossref","unstructured":"Gunjal, A., Yin, J., Bas, E.: Detecting and preventing hallucinations in large vision language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 18135\u201318143 (2024)","DOI":"10.1609\/aaai.v38i16.29771"},{"key":"23_CR12","unstructured":"Hicks, S., Stor\u00e5s, A.M., Halvorsen, P., de\u00a0Lange, T., Riegler, M., Thambawita, V.L.: Overview of ImageCLEFmedical 2023 - medical visual question answering for gastrointestinal tract. In: CLEF, pp. 1316\u20131327 (2023)"},{"key":"23_CR13","unstructured":"Hu, E.J., et\u00a0al.: LoRA: low-rank adaptation of large language models. In: ICLR, vol.\u00a01, p.\u00a03 (2022)"},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Jha, D., et\u00a0al.: GastroVision: a multi-class endoscopy image dataset for computer aided gastrointestinal disease detection. In: ICML ML4MHD Workshop, pp. 125\u2013140. Springer (2023)","DOI":"10.1007\/978-3-031-47679-2_10"},{"key":"23_CR15","doi-asserted-by":"crossref","unstructured":"Jiang, Y., et al.: CoMT: chain-of-medical-thought reduces hallucination in medical report generation. In: ICASSP, pp.\u00a01\u20135. IEEE (2025)","DOI":"10.1109\/ICASSP49660.2025.10887699"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Keshavarz, P., Bagherieh, S., Nabipoorashrafi, S.A., Chalian, H., Rahsepar, A.A., Kim, G.H.J., et\u00a0al.: ChatGPT in radiology: a systematic review of performance, pitfalls, and future perspectives. Diagnostic and interventional imaging (2024)","DOI":"10.1016\/j.diii.2024.04.003"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Li, J., et\u00a0al.: A deep learning and natural language processing-based system for automatic identification and surveillance of high-risk patients undergoing upper endoscopy: a multicenter study. EClinicalMedicine 53 (2022)","DOI":"10.1016\/j.eclinm.2022.101704"},{"key":"23_CR18","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"23_CR19","unstructured":"Liu, A., et\u00a0al.: DeepSeek-V3 technical report. arXiv preprint arXiv:2412.19437 (2024)"},{"key":"23_CR20","unstructured":"Liu, F., Lin, K., Li, L., Wang, J., Yacoob, Y., Wang, L.: Mitigating hallucination in large multi-modal models via robust instruction tuning. In: ICLR (2024)"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: CVPR, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"23_CR22","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. NeurIPS 36, 34892\u201334916 (2023)","journal-title":"NeurIPS"},{"issue":"6","key":"23_CR23","doi-asserted-by":"publisher","first-page":"269","DOI":"10.1159\/000477739","volume":"24","author":"S Marques","year":"2017","unstructured":"Marques, S., Bispo, M., Pimentel-Nunes, P., Chagas, C., Dinis-Ribeiro, M.: Image documentation in gastrointestinal endoscopy: review of recommendations. GE-Portuguese J. Gastroenterol. 24(6), 269\u2013274 (2017)","journal-title":"GE-Portuguese J. Gastroenterol."},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Haile, M.B., Ayodeji Olalekan\u00a0Salau, B.E., Belay, A.J.: Detection and classification of gastrointestinal disease using convolutional neural network and SVM. Cogent Eng. 9(1), 2084878 (2022)","DOI":"10.1080\/23311916.2022.2084878"},{"issue":"1","key":"23_CR25","doi-asserted-by":"publisher","first-page":"465","DOI":"10.1146\/annurev-psych-010416-044022","volume":"68","author":"J Metcalfe","year":"2017","unstructured":"Metcalfe, J.: Learning from errors. Ann. Rev. Psychol. 68(1), 465\u2013489 (2017)","journal-title":"Ann. Rev. Psychol."},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"23_CR27","doi-asserted-by":"crossref","unstructured":"Pogorelov, K., et al.: KVASIR: a multi-class image dataset for computer aided gastrointestinal disease detection. In: Proceedings of the ACM MMSys, pp. 164\u2013169 (2017)","DOI":"10.1145\/3083187.3083212"},{"key":"23_CR28","unstructured":"Pokhrel, S., et\u00a0al.: NCDD: nearest centroid distance deficit for out-of-distribution detection in gastrointestinal vision. arXiv preprint arXiv:2412.01590 (2024)"},{"key":"23_CR29","doi-asserted-by":"crossref","unstructured":"Pokhrel, S., Bhandari, S., Vazquez, E., Lambrou, T., Gyawali, P., Bhattarai, B.: TTA-OOD: test-time augmentation for improving out-of-distribution detection in gastrointestinal vision. In: MICCAI DEMI Workshop, pp. 33\u201342. Springer (2024)","DOI":"10.1007\/978-3-031-73748-0_4"},{"issue":"1","key":"23_CR30","doi-asserted-by":"publisher","first-page":"4171","DOI":"10.1038\/s41598-023-31223-5","volume":"13","author":"A Selivanov","year":"2023","unstructured":"Selivanov, A., Rogov, O.Y., Chesakov, D., Shelmanov, A., Fedulova, I., Dylov, D.V.: Medical image captioning via generative pretrained transformers. Sci. Rep. 13(1), 4171 (2023)","journal-title":"Sci. Rep."},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Shieh, A., Tran, B., He, G., Kumar, M., Freed, J.A., Majety, P.: Assessing ChatGPT 4.0\u2019s test performance and clinical diagnostic accuracy on USMLE STEP 2 CK and clinical case reports. Sci. Rep. 14(1), 9330 (2024)","DOI":"10.1038\/s41598-024-58760-x"},{"key":"23_CR32","unstructured":"Shrestha, P., Amgain, S., Khanal, B., Linte, C.A., Bhattarai, B.: Medical vision language pretraining: a survey. arXiv preprint arXiv:2312.06224 (2023)"},{"key":"23_CR33","unstructured":"Singhal, K., et\u00a0al.: Toward expert-level medical question answering with large language models. Nat. Med., 1\u20138 (2025)"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. In: CVPR, pp. 9568\u20139578 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"23_CR35","unstructured":"Wikipedia contributors: coefficient of variation (2025). https:\/\/en.wikipedia.org\/wiki\/Coefficient_of_variation. Accessed 27 Feb 2025"},{"key":"23_CR36","unstructured":"Yang, A., et\u00a0al.: Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025)"},{"key":"23_CR37","unstructured":"Ye, J., et al.: mPLUG-Owl3: towards long image-sequence understanding in multi-modal large language models. In: ICLR (2025)"},{"issue":"6","key":"23_CR38","doi-asserted-by":"publisher","first-page":"2578","DOI":"10.1007\/s10278-023-00844-7","volume":"36","author":"S Zhu","year":"2023","unstructured":"Zhu, S., et al.: Public imaging datasets of gastrointestinal endoscopy for artificial intelligence: a review. J. Digit. Imaging 36(6), 2578\u20132601 (2023)","journal-title":"J. Digit. Imaging"}],"container-title":["Lecture Notes in Computer Science","Medical Image Computing and Computer Assisted Intervention \u2013 MICCAI 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-05127-1_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,19]],"date-time":"2025-09-19T21:15:46Z","timestamp":1758316546000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-05127-1_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,20]]},"ISBN":["9783032051264","9783032051271"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-05127-1_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,20]]},"assertion":[{"value":"20 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"MICCAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Medical Image Computing and Computer-Assisted Intervention","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Daejeon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Korea (Republic of)","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"miccai2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/conferences.miccai.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}