{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,29]],"date-time":"2025-10-29T17:39:54Z","timestamp":1761759594653,"version":"build-2065373602"},"reference-count":58,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T00:00:00Z","timestamp":1759276800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92270108"],"award-info":[{"award-number":["92270108"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["XHD23F0201"],"award-info":[{"award-number":["XHD23F0201"]}]},{"name":"Foundation of Muyuan Laboratory","award":["14106022401","14106022402"],"award-info":[{"award-number":["14106022401","14106022402"]}]},{"name":"Research Center for Industries of the Future (RCIF) at Westlake University"},{"DOI":"10.13039\/100005625","name":"Westlake Education Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100005625","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Med. 
Imaging"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1109\/tmi.2025.3584857","type":"journal-article","created":{"date-parts":[[2025,7,2]],"date-time":"2025-07-02T13:44:57Z","timestamp":1751463897000},"page":"4087-4097","source":"Crossref","is-referenced-by-count":0,"title":["PathBench: Advancing the Benchmark of Large Multimodal Models for Pathology Image Understanding at Patch and Whole Slide Level"],"prefix":"10.1109","volume":"44","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1277-4316","authenticated-orcid":false,"given":"Yuxuan","family":"Sun","sequence":"first","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8642-4569","authenticated-orcid":false,"given":"Hao","family":"Wu","sequence":"additional","affiliation":[{"name":"Faculty of Innovation Engineering, Macau University of Science and Technology, Macau, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5705-3718","authenticated-orcid":false,"given":"Chenglu","family":"Zhu","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}]},{"given":"Yixuan","family":"Si","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}]},{"given":"Qizi","family":"Chen","sequence":"additional","affiliation":[{"name":"Wuxi School of Medicine, Jiangnan University, Wuxi, China"}]},{"given":"Yunlong","family":"Zhang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"given":"Kai","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, The Ohio State University, Columbus, OH, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6519-5043","authenticated-orcid":false,"given":"Jingxiong","family":"Li","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Hangzhou, China"}]},{"given":"Jiatong","family":"Cai","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}]},{"given":"Yuhan","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Hangzhou City University, Hangzhou, China"}]},{"given":"Lin","family":"Sun","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Hangzhou City University, Hangzhou, China"}]},{"given":"Tao","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}]},{"given":"Lin","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Engineering, Westlake University, Hangzhou, China"}]}],"member":"263","reference":[{"volume-title":"Robbins and Cotran Pathologic Basis of Disease","year":"2014","author":"Kumar","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28308"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-023-02504-3"},{"key":"ref4","first-page":"37995","article-title":"Quilt-1M: One million image-text pairs for histopathology","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","author":"Ikezogwo"},{"key":"ref5","first-page":"1","article-title":"Pathgen-1.6M: 1.6 million pathology image-text pairs generation through multi-agent collaboration","volume-title":"Proc. 13th Int. Conf. Learn. 
Represent.","author":"Sun"},{"key":"ref6","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"139","author":"Radford"},{"key":"ref7","article-title":"PMC-VQA: Visual instruction tuning for medical visual question answering","author":"Zhang","year":"2023","journal-title":"arXiv:2305.10415"},{"key":"ref8","article-title":"Quilt-LLaVA: Visual instruction tuning by extracting localized narratives from open-source histopathology videos","author":"Saygin Seyfioglu","year":"2023","journal-title":"arXiv:2312.04746"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_4"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72083-3_18"},{"key":"ref11","article-title":"WsiCaption: Multiple instance generation of pathology reports for gigapixel whole-slide images","volume-title":"arXiv:2311.16480","author":"Chen","year":"2023"},{"key":"ref12","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NIPS","author":"Brown"},{"issue":"1","key":"ref13","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref14","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv:2302.13971"},{"key":"ref15","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref16","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","author":"Alayrac"},{"key":"ref17","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"volume-title":"Introducing Our Multimodal Models","year":"2023","author":"Bavishi","key":"ref18"},{"volume-title":"GPT-4Vision System Card","year":"2023","key":"ref19"},{"key":"ref20","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref21","article-title":"Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023","journal-title":"arXiv:2308.12966"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43987-2_38"},{"key":"ref23","article-title":"FinVis-GPT: A multimodal large language model for financial chart analysis","author":"Wang","year":"2023","journal-title":"arXiv:2308.01430"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671801"},{"key":"ref25","article-title":"Advancing multimodal medical capabilities of Gemini","author":"Yang","year":"2024","journal-title":"arXiv:2405.03162"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-024-03185-2"},{"key":"ref27","first-page":"28541","article-title":"LLaVA-med: Training a large language-and-vision assistant for biomedicine in one day","volume-title":"Proc. Adv. Neural Inform. Process. 
Syst.","author":"Li"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1038\/s41591-024-02856-4"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01893"},{"key":"ref30","first-page":"26650","article-title":"Lamm: Language-assisted multi-modal instruction-tuning dataset, framework, and benchmark","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","author":"Yin"},{"key":"ref31","article-title":"LVLM-eHub: A comprehensive evaluation benchmark for large vision-language models","author":"Xu","year":"2023","journal-title":"arXiv:2306.09265"},{"key":"ref32","article-title":"SEED-bench: Benchmarking multimodal LLMs with generative comprehension","author":"Li","year":"2023","journal-title":"arXiv:2307.16125"},{"key":"ref33","article-title":"MMBench: Is your multi-modal model an all-around player?","author":"Liu","year":"2023","journal-title":"arXiv:2307.06281"},{"key":"ref34","article-title":"MM-vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2023","journal-title":"arXiv:2308.02490"},{"key":"ref35","article-title":"BenchLMM: Benchmarking cross-style visual capability of large multimodal models","author":"Cai","year":"2023","journal-title":"arXiv:2312.02896"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref37","article-title":"Evaluation and analysis of hallucination in large vision-language models","author":"Wang","year":"2023","journal-title":"arXiv:2308.15126"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01363"},{"key":"ref39","article-title":"Are we on the right way for evaluating large vision-language models?","author":"Chen","year":"2024","journal-title":"arXiv:2403.20330"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1038\/sdata.2018.251"},{"key":"ref41","first-page":"1081","article-title":"Overview of the VQA-med task at ImageCLEF 2021: Visual question answering and generation in the medical domain","volume-title":"Proc. CLEF Conf. Labs Eval. Forum-Work","author":"Abacha"},{"key":"ref42","article-title":"PathVQA: 30000+ questions for medical visual question answering","author":"He","year":"2020","journal-title":"arXiv:2003.10286"},{"key":"ref43","first-page":"1","article-title":"Caption generation from histopathology whole-slide images using pre-trained transformers","volume-title":"Proc. Medical Imag. Deep Learn., Short Paper Track","author":"Guevara"},{"key":"ref44","article-title":"What a whole slide image can tell? 
Subtype-guided masked transformer for pathological image captioning","author":"Qin","year":"2023","journal-title":"arXiv:2310.20607"},{"volume-title":"Introducing Meta LLAMA 3: The Most Capable Openly Available LLM to Date","year":"2024","key":"ref45"},{"key":"ref46","article-title":"Phi-3 technical report: A highly capable language model locally on your phone","volume-title":"arXiv:2404.14219","author":"Abdin","year":"2024"},{"volume-title":"Hello GPT-4O","year":"2024","key":"ref47"},{"key":"ref48","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023","journal-title":"arXiv:2306.14824"},{"key":"ref49","article-title":"LLaMA-adapter v2: Parameter-efficient visual instruction model","author":"Gao","year":"2023","journal-title":"arXiv:2304.15010"},{"key":"ref50","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref51","article-title":"Otter: A multi-modal model with in-context instruction tuning","author":"Li","year":"2023","journal-title":"arXiv:2305.03726"},{"key":"ref52","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inform. Process. Syst.","author":"Liu"},{"key":"ref53","article-title":"Yi: Open foundation models by 01.AI","author":"Young","year":"2024","journal-title":"arXiv:2403.04652"},{"volume-title":"Minicpm-v 2.0: An Efficient End-Side MLLM with Strong OCR and Understanding Capabilities","year":"2024","author":"Team","key":"ref54"},{"volume-title":"Minicpm-V","year":"2024","author":"Team","key":"ref55"},{"volume-title":"Llava-Next: Stronger Llms Supercharge Multimodal Capabilities in the Wild","year":"2024","author":"Li","key":"ref56"},{"key":"ref57","article-title":"Cambrian-1: A fully open, vision-centric exploration of multimodal LLMs","author":"Tong","year":"2024","journal-title":"arXiv:2406.16860"},{"key":"ref58","article-title":"DeepSeek-V3 technical report","volume-title":"arXiv:2412.19437","author":"Liu","year":"2024"}],"container-title":["IEEE Transactions on Medical Imaging"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/42\/11218268\/11062674.pdf?arnumber=11062674","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,29]],"date-time":"2025-10-29T17:35:29Z","timestamp":1761759329000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11062674\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10]]},"references-count":58,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tmi.2025.3584857","relation":{},"ISSN":["0278-0062","1558-254X"],"issn-type":[{"type":"print","value":"0278-0062"},{"type":"electronic","value":"1558-254X"}],"subject":[],"published":{"date-parts":[[2025,10]]}}}
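The record above is a standard Crossref "work" message; such records can be retrieved from the public Crossref REST API at https://api.crossref.org/works/{doi}. What follows is a minimal sketch, assuming network access and only the Python standard library, of fetching this record by its DOI and reading the fields that appear above ("title", "author", "container-title", and so on); it is an illustration of the record's structure, not part of the record itself.

# Sketch: fetch and parse a Crossref work record like the one above.
import json
import urllib.request

DOI = "10.1109/tmi.2025.3584857"  # DOI taken from the record above

# The Crossref REST API returns the same JSON shape as the record above:
# a top-level envelope with "status"/"message-type", payload under "message".
with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    record = json.load(resp)

msg = record["message"]
print(msg["title"][0])  # article title (Crossref stores titles as a list)
print(", ".join(f'{a["given"]} {a["family"]}' for a in msg["author"]))
print(msg["container-title"][0],  # journal name
      "vol.", msg["volume"], "no.", msg["issue"], "pp.", msg["page"])
print("references:", msg["references-count"])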