{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T15:52:58Z","timestamp":1778255578120,"version":"3.51.4"},"reference-count":47,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:00:00Z","timestamp":1764547200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100007162","name":"Guangdong Science and Technology Department","doi-asserted-by":"publisher","award":["2024ZDZX2004"],"award-info":[{"award-number":["2024ZDZX2004"]}],"id":[{"id":"10.13039\/501100007162","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Henan Key Laboratory of Imaging and Intelligent Processing","award":["HKLIIP2023-A07"],"award-info":[{"award-number":["HKLIIP2023-A07"]}]},{"name":"Guangdong Provincial Key Lab of Integrated Communication Sensing and Computation for Ubiquitous Internet of Things","award":["2023B1212010007"],"award-info":[{"award-number":["2023B1212010007"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Biomed. Health Inform."],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1109\/jbhi.2025.3538324","type":"journal-article","created":{"date-parts":[[2025,2,13]],"date-time":"2025-02-13T13:43:01Z","timestamp":1739454181000},"page":"9027-9040","source":"Crossref","is-referenced-by-count":7,"title":["Enhancing Visual Reasoning With LLM-Powered Knowledge Graphs for Visual Question Localized-Answering in Robotic Surgery"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-8523-818X","authenticated-orcid":false,"given":"Pengfei","family":"Hao","sequence":"first","affiliation":[{"name":"ROAS Thrust, Systems Hub, The Hong Kong University of Science and Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9726-4253","authenticated-orcid":false,"given":"Hongqiu","family":"Wang","sequence":"additional","affiliation":[{"name":"ROAS Thrust, Systems Hub, The Hong Kong University of Science and Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7344-7733","authenticated-orcid":false,"given":"Guang","family":"Yang","sequence":"additional","affiliation":[{"name":"Bioengineering\/Imperial-X, Imperial College London, London, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3871-663X","authenticated-orcid":false,"given":"Lei","family":"Zhu","sequence":"additional","affiliation":[{"name":"Henan Key Laboratory of Imaging and Intelligent Processing, Department of Electronic and Computer Engineering, The Hong Kong University of Science and Technology, Hong Kong, SAR, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-021-98390-1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160647"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s11548-024-03141-y"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-16449-1_4"},{"key":"ref5","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"33","author":"Brown","year":"2020"},{"key":"ref6","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.artmed.2023.102611"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-023-00952-2"},{"key":"ref9","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43996-4_27"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-43996-4_38"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/icra48891.2023.10160403"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00613"},{"key":"ref14","first-page":"28611","article-title":"Text promptable surgical instrument segmentation with vision-language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zhou","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMI.2024.3426953"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.3233\/SW-212959"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2890628"},{"key":"ref18","article-title":"VisualBERT: A simple and performant baseline for vision and language","author":"Li","year":"2019"},{"key":"ref19","article-title":"SimVLM: Simple visual language model pretraining with weak supervision","author":"Wang","year":"2021"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2025.103644"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72089-5_29"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref23","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Li","year":"2021"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref26","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Jia","year":"2021"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2857768"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01419"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.558"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2019.112948"},{"key":"ref32","first-page":"3499","article-title":"Stochastic beams and where to find them: The gumbel-top-k trick for sampling sequences without replacement","volume-title":"Proc. 36th Int. Conf. Mach. Learn.","author":"Kool","year":"2019"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref34","article-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"ref35","first-page":"8792","article-title":"Generalized cross entropy loss for training deep neural networks with noisy labels","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"31","author":"Zhang","year":"2018"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00075"},{"key":"ref37","article-title":"2018 robotic scene segmentation challenge","author":"Allan","year":"2020"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59716-0_60"},{"key":"ref39","article-title":"2017 robotic instrument segmentation challenge","author":"Allan","year":"2019"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2018.2817340"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018102"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"issue":"2","key":"ref43","doi-asserted-by":"crossref","first-page":"1","DOI":"10.5121\/ijdkp.2015.5201","article-title":"A review on evaluation metrics for data classification evaluations","volume":"5","author":"Hossin","year":"2015","journal-title":"Int. J. Data Mining Knowl. Manage. Process"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.3390\/fi15060192"},{"key":"ref45","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017"},{"key":"ref46","article-title":"Claude 3 model card","year":"2023"},{"key":"ref47","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024"}],"container-title":["IEEE Journal of Biomedical and Health Informatics"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6221020\/11284538\/10886944.pdf?arnumber=10886944","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T18:34:45Z","timestamp":1765305285000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10886944\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12]]},"references-count":47,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/jbhi.2025.3538324","relation":{},"ISSN":["2168-2194","2168-2208"],"issn-type":[{"value":"2168-2194","type":"print"},{"value":"2168-2208","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12]]}}}