{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T15:43:56Z","timestamp":1779291836891,"version":"3.51.4"},"reference-count":56,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.00716","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"7634-7644","source":"Crossref","is-referenced-by-count":1,"title":["AgroBench: Vision-Language Model Benchmark in Agriculture"],"prefix":"10.1109","author":[{"given":"Risa","family":"Shinoda","sequence":"first","affiliation":[{"name":"The University of Osaka"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nakamasa","family":"Inoue","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hirokatsu","family":"Kataoka","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science and Technology (AIST)"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Masaki","family":"Onishi","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science and Technology (AIST)"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yoshitaka","family":"Ushiku","sequence":"additional","affiliation":[{"name":"OMRON SINIC X, Place, Pincode"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Paddy doctor: A visual image dataset for automated paddy disease classification and benchmarking","author":"A","year":"2022","journal-title":"IEEE Dataport"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.compag.2022.107587"},{"key":"ref4","article-title":"Agrogpt: Efficient agricultural vision-language model with expert tuning","author":"Awais","year":"2024","journal-title":"arXiv preprint arXiv:2410.08405"},{"key":"ref5","article-title":"Qwen technical report","author":"Bai","year":"2023","journal-title":"arXiv preprint arXiv:2309.16609"},{"key":"ref6","article-title":"Are we on the right way for evaluating large vision-language models?","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv:2403.20330"},{"key":"ref7","article-title":"Are we on the right way for evaluating large vision-language models?","author":"Chen","year":"2024","journal-title":"arXiv preprint arXiv:2403.20330"},{"key":"ref8","article-title":"Pali: A jointly-scaled multilingual languageimage model","author":"Chen","year":"2023","journal-title":"Pali: A jointly-scaled multilingual languageimage model"},{"key":"ref9","article-title":"Tvbench: Redesigning video-language evaluation","author":"Cores","year":"2024","journal-title":"Tvbench: Redesigning video-language evaluation"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-023-01158-y"},{"key":"ref11","article-title":"Perrenial plants detection","author":"Geisler","year":"2021","journal-title":"Perrenial plants detection"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-16220-1_8"},{"key":"ref13","article-title":"Image classification for cssvd detection in cacao plants","author":"Jesse","year":"2024","journal-title":"arXiv preprint 2405.04535"},{"key":"ref14","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"In Proceedings of the 39th International Conference on Machine Learning","author":"Li"},{"key":"ref15","first-page":"19730","article-title":"BLIP-","volume-title":"In Proceedings of the 40th International Conference on Machine Learning","volume":"2","author":"Li"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"ref17","article-title":"Improved baselines with visual instruction tuning","author":"Liu","year":"2023","journal-title":"arXiv:2310.03744"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref19","article-title":"Llava-next: Improved reasoning, ocr, and world knowledge","author":"Liu","year":"2024","journal-title":"Llava-next: Improved reasoning, ocr, and world knowledge"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_10"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.3390\/s25144354"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/j.compag.2022.106696"},{"key":"ref23","article-title":"Cottonweeddet3","author":"Lu","year":"2022","journal-title":"Cottonweeddet3"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.3390\/rs12081246"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2023.109462"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.402"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.3389\/fpls.2016.014192016"},{"key":"ref30","article-title":"Video-bench: A comprehensive benchmark and toolkit for evaluating video-based large language models","author":"Ning","year":"2023","journal-title":"Video-bench: A comprehensive benchmark and toolkit for evaluating video-based large language models"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-018-38343-3"},{"key":"ref32","article-title":"OpenAI. Gpt-4o","year":"2024","journal-title":"Gpt-4o"},{"key":"ref33","article-title":"OpenAI. Gpt-4o mini","year":"2024","journal-title":"Gpt-4o mini"},{"key":"ref34","article-title":"Indian rice disease dataset (irdd)","author":"Pal","year":"2023","journal-title":"Indian rice disease dataset (irdd)"},{"key":"ref35","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"In ICML"},{"key":"ref36","article-title":"Multimedeval: A benchmark and a toolkit for evaluating medical vision-language models","author":"Royer","year":"2024","journal-title":"Multimedeval: A benchmark and a toolkit for evaluating medical vision-language models"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2023.109952"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2023.109955"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.atech.2023.100196"},{"key":"ref40","article-title":"Sbs figures: Pre-training figure qa from stage-by-stage synthesized images","author":"Shinoda","year":"2024","journal-title":"Sbs figures: Pre-training figure qa from stage-by-stage synthesized images"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3371158.3371196"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00372"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00547"},{"key":"ref44","article-title":"Generative multimodal models are in-context learners","author":"Sun","year":"2023","journal-title":"arXiv preprint arXiv:2312.13286"},{"key":"ref45","article-title":"Generative pretraining in multimodality","author":"Sun","year":"2023","journal-title":"arXiv preprint arXiv:2307.05222"},{"key":"ref46","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024","journal-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2024.110268"},{"key":"ref48","article-title":"Agri-llava: Knowledge-infused large multimodal assistant on agricultural pests and diseases","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv:2412.02158"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.3390\/s21051601"},{"key":"ref50","article-title":"Cogvlm: Visual expert for pretrained language models","author":"Wang","year":"2024","journal-title":"In NeurIPS"},{"key":"ref51","article-title":"Emu3: Next-token prediction is all you need","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv:2409.18869"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680599"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00899"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3390\/s23146298"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref56","article-title":"Mmmu-pro: A more robust multi-discipline multimodal understanding benchmark","author":"Yue","year":"2024","journal-title":"arXiv preprint arXiv:2409.02813"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444932.pdf?arnumber=11444932","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:23:26Z","timestamp":1777530206000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444932\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":56,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.00716","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}