{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T07:10:21Z","timestamp":1778051421590,"version":"3.51.4"},"reference-count":79,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00148","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"1459-1470","source":"Crossref","is-referenced-by-count":0,"title":["MarineEval: Assessing the Marine Intelligence of Vision-Language Models"],"prefix":"10.1109","author":[{"given":"Yuk-Kwan","family":"Wong","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tuan-An","family":"To","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jipeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziqiang","family":"Zheng","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sai-Kit","family":"Yeung","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"World register of marine species (worms)","author":"Ahyong","year":"2025"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00703"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref5","volume-title":"Reef Fish identification: Tropical pacific","author":"Allen","year":"2015"},{"key":"ref6","volume-title":"Introducing the next generation of claude","year":"2024"},{"key":"ref7","author":"Bai","year":"2023","journal-title":"Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3354\/meps14750"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0850"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref12","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","author":"Chiang","year":"2023"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.3389\/fmars.2025.1631423"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.3354\/meps14721"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.3389\/fmars.2025.1581778"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.jmarsys.2025.104084"},{"key":"ref17","volume-title":"Fishbase","author":"Froese","year":"2025"},{"key":"ref18","article-title":"Mme: A comprehensive evaluation benchmark for multimodal large language models","author":"Fu","year":"2023"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.3354\/meps14753"},{"key":"ref20","article-title":"G-llava: Solving geometric problem with multi-modal large language model","author":"Gao","year":"2023"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s00227-025-04675-8"},{"key":"ref22","volume-title":"Gemini 2.0 flash model card","year":"2024"},{"key":"ref23","volume-title":"Nudibranch & Sea Slug Identification: Indo-Pacific","author":"Goslinear","year":"2019"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref25","article-title":"Coralvqa: A large-scale visual question answering dataset for coral reef image understanding","author":"Han","year":"2025"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00914"},{"key":"ref27","article-title":"Mini-monkey: Multi-scale adaptive cropping for multimodal large language models","author":"Huang","year":"2024"},{"key":"ref28","volume-title":"Reef creature identification: Tropical pacific","author":"Humann","year":"2010"},{"key":"ref29","article-title":"Heron-bench: A benchmark for evaluating vision language models in japanese","author":"Inoue","year":"2024"},{"key":"ref30","volume-title":"The IUCN red list of threatened species","year":"2022"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/WACVW58289.2023.00033"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s00227-025-04699-0"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-69532-3_30"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01263"},{"key":"ref35","article-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension","author":"Li","year":"2023"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01263"},{"key":"ref37","article-title":"Llava-med: Training a large language-and-vision assistant for biomedicine in one day","author":"Li","year":"2023"},{"key":"ref38","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International Conference on Machine Learning (ICML)","author":"Li"},{"key":"ref39","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International Conference on Machine Learning (ICML)","author":"Li"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1016\/j.jmarsys.2025.104049"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref42","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref43","author":"Lu","year":"2024","journal-title":"Deepseek-vl: Towards real-world vision-language understanding"},{"key":"ref44","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"The 36th Conference on Neural Information Processing Systems (NeurIPS)","author":"Lu"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1016\/j.jmarsys.2025.104048"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0696"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2024.3432873"},{"key":"ref49","year":"2022","journal-title":"Introducing chatgpt"},{"key":"ref50","volume-title":"Gpt-4 technical report","year":"2023"},{"key":"ref51","year":"2024","journal-title":"Gpt-4o system card"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.23919\/OCEANS52994.2023.10337406"},{"key":"ref53","article-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/s00227-025-04677-6"},{"key":"ref55","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford"},{"key":"ref56","author":"Ren","year":"2024","journal-title":"Grounded sam: Assembling open-world models for diverse visual tasks"},{"key":"ref57","article-title":"Multimedeval: A benchmark and a toolkit for evaluating medical vision-language models","author":"Royer","year":"2024"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/s00227-025-04681-w"},{"key":"ref59","article-title":"Assessing gpt4-v on structured reasoning tasks","author":"Singh","year":"2023"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01836"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01325"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1016\/j.jmarsys.2025.104053"},{"key":"ref63","volume-title":"Oil spill dataset- binary image classification","author":"Reddy","year":"2024"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1016\/j.ecoinf.2025.103402"},{"key":"ref65","volume-title":"Grok-2 beta release","year":"2024"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s00227-025-04655-y"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3241"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.3389\/fmars.2025.1569186"},{"key":"ref69","article-title":"Internlm-xcomposer-2.5: A versatile large vision language model supporting long-contextual input and output","author":"Zhang","year":"2024"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73242-3_10"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3104230"},{"key":"ref72","article-title":"Marinegpt: Unlocking secrets of ocean to the public","author":"Zheng","year":"2023"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72627-9_14"},{"key":"ref74","article-title":"Exploring boundary of gpt-4v on marine analysis: A preliminary case study","author":"Zheng","year":"2024"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02661"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01857"},{"key":"ref77","article-title":"Exploring recommendation capabilities of gpt-4v (ision): A preliminary case study","author":"Zhou","year":"2023"},{"key":"ref78","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023"},{"key":"ref79","article-title":"Internvl3: Exploring advanced training and test-time recipes for open-source multimodal models","author":"Zhu","year":"2025"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492634.pdf?arnumber=11492634","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:10:15Z","timestamp":1778047815000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492634\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":79,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00148","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}