{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T17:12:39Z","timestamp":1775841159008,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","funder":[{"name":"STL 2030 Major Projects","award":["2022120020403"],"award-info":[{"award-number":["2022120020403"]}]},{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LDF030002"],"award-info":[{"award-number":["LDF030002"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792971","type":"proceedings-article","created":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T21:54:39Z","timestamp":1775771679000},"page":"9800-9810","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["AgriGPT-Omni: A Unified Speech\u2013Vision\u2013Text Framework for Multilingual Agricultural Intelligence"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8929-1007","authenticated-orcid":false,"given":"Bo","family":"Yang","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7203-3095","authenticated-orcid":false,"given":"Lanfei","family":"Feng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0789-1808","authenticated-orcid":false,"given":"Yunkui","family":"Chen","sequence":"additional","affiliation":[{"name":"Zhejiang University, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0642-1533","authenticated-orcid":false,"given":"Yu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7004-9156","authenticated-orcid":false,"given":"Jianyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2433-1010","authenticated-orcid":false,"given":"Xiao","family":"Xu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0026-026X","authenticated-orcid":false,"given":"Nueraili","family":"Aierken","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5846-3065","authenticated-orcid":false,"given":"Shijian","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Josh Achiam Susan Adler Sandhini Agarwal et al. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023). https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_2_1_2_1","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr et al. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. In Advances in Neural Information Processing Systems (NeurIPS). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/960a172bc7fbf0177ccccbb411a7d800-Paper-Conference.pdf"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of The 12th Language Resources and Evaluation Conference (LREC). 4218-4222","author":"Ardila Rosana","year":"2020","unstructured":"Rosana Ardila, Megan Branson, Kelly Davis, Michael Henretty, Michael Kohler, Josh Meyer, Reuben Morais, Lindsay Saunders, Francis M. Tyers, and Gregor Weber. 2020. Common Voice: A Massively-Multilingual Speech Corpus. In Proceedings of The 12th Language Resources and Evaluation Conference (LREC). 4218-4222."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00555"},{"key":"e_1_3_2_1_5_1","unstructured":"Shuai Bai Yuxuan Cai Ruizhe Chen Keqin Chen Xionghui Chen Zesen Cheng Lianghao Deng Wei Ding Chang Gao Chunjiang Ge Wenbin Ge Zhifang Guo Qidong Huang Jie Huang Fei Huang Binyuan Hui Shutong Jiang Zhaohai Li Mingsheng Li Mei Li Kaixin Li Zicheng Lin Junyang Lin Xuejing Liu Jiawei Liu Chenglong Liu Yang Liu Dayiheng Liu Shixuan Liu Dunjie Lu Ruilin Luo Chenxu Lv Rui Men Lingchen Meng Xuancheng Ren Xingzhang Ren Sibo Song Yuchong Sun Jun Tang Jianhong Tu Jianqiang Wan Peng Wang Pengfei Wang Qiuyue Wang Yuxuan Wang Tianbao Xie Yiheng Xu Haiyang Xu Jin Xu Zhibo Yang Mingkun Yang Jianxin Yang An Yang Bowen Yu Fei Zhang Hang Zhang Xi Zhang Bo Zheng Humen Zhong Jingren Zhou Fan Zhou Jing Zhou Yuanzhi Zhu and Ke Zhu. 2025a. Qwen3-VL Technical Report. arXiv preprint arXiv:2511.21631 (2025)."},{"key":"e_1_3_2_1_6_1","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et al. 2025b. Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)."},{"key":"e_1_3_2_1_7_1","unstructured":"Lo\u00efc Barrault et al. 2023. SeamlessM4T: Massively Multilingual & Multimodal Machine Translation. arXiv preprint arXiv:2308.11596 (2023)."},{"key":"e_1_3_2_1_8_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems (NeurIPS). https:\/\/proceedings.neurips.cc\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_2_1_9_1","volume-title":"SpeechStew: Simply Mix All Available Speech Recognition Data to Train One Large Neural Network. arXiv preprint arXiv:2104.02133","author":"Chan William","year":"2021","unstructured":"William Chan, Daniel Park, Chris Lee, and Yu Zhang. 2021. SpeechStew: Simply Mix All Available Speech Recognition Data to Train One Large Neural Network. arXiv preprint arXiv:2104.02133 (2021)."},{"key":"e_1_3_2_1_10_1","unstructured":"Yunfei Chu Jin Xu Qian Yang Haojie Wei Xipin Wei Zhifang Guo Yichong Leng Yuanjun Lv Jinzheng He Junyang Lin et al. 2024. Qwen2-audio technical report. arXiv preprint arXiv:2407.10759 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing. 4570-4581","author":"Clark Peter","year":"2018","unstructured":"Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Clare Schoenick, and Oyvind Tafjord. 2018. Think you have solved Question Answering? Try ARC, the AI2 Reasoning Challenge. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing. 4570-4581."},{"key":"e_1_3_2_1_12_1","unstructured":"Gheorghe Comanici Eric Bieber Mike Schaekermann Ice Pasupat Noveen Sachdeva Inderjit Dhillon Marcel Blistein Ori Ram Dan Zhang Evan Rosen et al. 2025. Gemini 2.5: Pushing the frontier with advanced reasoning multimodality long context and next generation agentic capabilities. arXiv preprint arXiv:2507.06261 (2025)."},{"key":"e_1_3_2_1_13_1","volume-title":"FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech. arXiv preprint arXiv:2205.12446","author":"Conneau Alexis","year":"2022","unstructured":"Alexis Conneau, Min Ma, Simran Khanuja, Yu Zhang, Vera Axelrod, Siddharth Dalmia, Jason Riesa, Clara Rivera, and Ankur Bapna. 2022. FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech. arXiv preprint arXiv:2205.12446 (2022)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.nlp4pi-1.16"},{"key":"e_1_3_2_1_15_1","volume-title":"AgMMU: A Comprehensive Agricultural Multimodal Understanding and Reasoning Benchmark. arXiv preprint arXiv:2504.10568","author":"Gauba Aruna","year":"2025","unstructured":"Aruna Gauba, Irene Pi, Yunze Man, Ziqi Pang, Vikram S. Adve, and Yu-Xiong Wang. 2025. AgMMU: A Comprehensive Agricultural Multimodal Understanding and Reasoning Benchmark. arXiv preprint arXiv:2504.10568 (2025)."},{"key":"e_1_3_2_1_16_1","volume-title":"Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:2312.11805","author":"Team Gemini","year":"2023","unstructured":"Gemini Team. 2023. Gemini: A Family of Highly Capable Multimodal Models. arXiv preprint arXiv:2312.11805 (2023). https:\/\/arxiv.org\/abs\/2312.11805"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Yash Goyal Tejas Khot Douglas Summers-Stay Dhruv Batra and Devi Parikh. 2017. Making the V in VQA Matter: Elevating the Role of Image Understanding. In CVPR. https:\/\/openaccess.thecvf.com\/content_cvpr_2017\/papers\/Goyal_Making_the_v_CVPR_2017_paper.pdf","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_18_1","unstructured":"Hadi Hashemi Akhil Arora Zheng Li et al. 2024. LLM-RUBRIC: A Multidimensional Calibrated Approach to Automated Evaluation. In ACL (Long Papers). https:\/\/aclanthology.org\/2024.acl-long.745.pdf"},{"key":"e_1_3_2_1_19_1","volume-title":"Measuring Massive Multitask Language Understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2021. Measuring Massive Multitask Language Understanding. arXiv preprint arXiv:2009.03300 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"Step-audio: Unified understanding and generation in intelligent speech interaction. arXiv preprint arXiv:2502.11946","author":"Huang Ailin","year":"2025","unstructured":"Ailin Huang, Boyong Wu, Bruce Wang, Chao Yan, Chen Hu, Chengli Feng, Fei Tian, Feiyu Shen, Jingbei Li, Mingrui Chen, et al., 2025. Step-audio: Unified understanding and generation in intelligent speech interaction. arXiv preprint arXiv:2502.11946 (2025)."},{"key":"e_1_3_2_1_21_1","volume-title":"Barun Patra, et al.","author":"Huang Shaohan","year":"2023","unstructured":"Shaohan Huang, Li Dong, Wenhui Wang, Yaru Hao, Saksham Singhal, Shuming Ma, Tengchao Lv, Lei Cui, Owais Khan Mohammed, Barun Patra, et al., 2023. Language Is Not All You Need: Aligning Perception with Language Models. arXiv preprint arXiv:2302.14045 (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Manning","author":"Hudson Drew A.","year":"2019","unstructured":"Drew A. Hudson and Christopher D. Manning. 2019. GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering. In CVPR. https:\/\/openaccess.thecvf.com\/content_CVPR_2019\/papers\/Hudson_GQA_A_New_Dataset_for_Real-World_Visual_Reasoning_and_Compositional_CVPR_2019_paper.pdf"},{"key":"e_1_3_2_1_23_1","unstructured":"David Hughes Marcel Salath\u00e9 et al. 2015. An open access repository of images on plant health to enable the development of mobile disease diagnostics. arXiv preprint arXiv:1511.08060 (2015)."},{"key":"e_1_3_2_1_24_1","first-page":"707","article-title":"Binary Codes Capable of Correcting Deletions, Insertions, and Reversals","volume":"10","author":"Levenshtein Vladimir I.","year":"1966","unstructured":"Vladimir I. Levenshtein. 1966. Binary Codes Capable of Correcting Deletions, Insertions, and Reversals. Soviet Physics Doklady, Vol. 10, 8 (1966), 707-710. https:\/\/nymity.ch\/sybilhunting\/pdf\/Levenshtein1966a.pdf","journal-title":"Soviet Physics Doklady"},{"key":"e_1_3_2_1_25_1","unstructured":"Boyu Li et al. 2023a. SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension. arXiv:2307.16125 (2023). https:\/\/arxiv.org\/abs\/2307.16125"},{"key":"e_1_3_2_1_26_1","unstructured":"Boxun Li Yadong Li Zhiyuan Li Congyi Liu Weilin Liu Guowei Niu Zheyue Tan Haiyang Xu Zhuyu Yao Tao Yuan et al. 2025. Megrez-omni technical report. arXiv preprint arXiv:2502.15803 (2025)."},{"key":"e_1_3_2_1_27_1","volume-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125","author":"Li Bohao","year":"2023","unstructured":"Bohao Li, Rui Wang, Guangzhi Wang, Yuying Ge, Yixiao Ge, and Ying Shan. 2023b. Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)."},{"key":"e_1_3_2_1_28_1","volume-title":"Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Improved Baselines with Visual Instruction Tuning. arXiv preprint arXiv:2310.03744 (2023)."},{"key":"e_1_3_2_1_29_1","unstructured":"Yang Liu Yifan Li et al. 2023a. MMBench: Is Your Multi-Modal Model an All-Around Player? arXiv:2307.06281 (2023). https:\/\/arxiv.org\/abs\/2307.06281"},{"key":"e_1_3_2_1_30_1","volume-title":"Multimodal Large Language Models for Intelligent Diagnosis and Management of Crop Nutrient Deficiencies and Environmental Stresses. arXiv preprint arXiv:2502.06789","author":"Long Zhen","year":"2025","unstructured":"Zhen Long and Ming Rao. 2025. Multimodal Large Language Models for Intelligent Diagnosis and Management of Crop Nutrient Deficiencies and Environmental Stresses. arXiv preprint arXiv:2502.06789 (2025)."},{"key":"e_1_3_2_1_31_1","volume-title":"Ziya-Visual: Bilingual Large Vision-Language Model via Multi-Task Instruction Tuning. arXiv preprint arXiv:2310.08166","author":"Lu Junyu","year":"2023","unstructured":"Junyu Lu, Dixiang Zhang, Xiaojun Wu, Xinyu Gao, Ruyi Gan, Jiaxing Zhang, Yan Song, and Pingjian Zhang. 2023. Ziya-Visual: Bilingual Large Vision-Language Model via Multi-Task Instruction Tuning. arXiv preprint arXiv:2310.08166 (2023). https:\/\/arxiv.org\/abs\/2310.08166"},{"key":"e_1_3_2_1_32_1","unstructured":"Pan Lu Swaroop Mishra Tony Xia Liang Qiu Kai-Wei Chang Song-Chun Zhu Oyvind Tafjord Peter Clark and Ashwin Kalyan. 2022. Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering. In NeurIPS. https:\/\/scienceqa.github.io\/"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1260"},{"key":"e_1_3_2_1_34_1","unstructured":"Vineel Pratap Andros Tjandra Bowen Shi et al. 2023. Scaling Speech Technology to 1 000 Languages. arXiv preprint arXiv:2305.13516 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Mls: A large-scale multilingual dataset for speech research. arXiv preprint arXiv:2012.03411","author":"Pratap Vineel","year":"2020","unstructured":"Vineel Pratap, Qiantong Xu, Anuroop Sriram, Gabriel Synnaeve, and Ronan Collobert. 2020. Mls: A large-scale multilingual dataset for speech research. arXiv preprint arXiv:2012.03411 (2020)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/715"},{"key":"e_1_3_2_1_38_1","volume-title":"AgroLLM: Connecting Farmers and Agricultural Practices through Large Language Models for Enhanced Knowledge Transfer and Practical Application. arXiv preprint arXiv:2503.04788","author":"Samuel Dinesh Jackson","year":"2025","unstructured":"Dinesh Jackson Samuel, Inna Skarga-Bandurova, David Sikolia, and Muhammad Awais. 2025. AgroLLM: Connecting Farmers and Agricultural Practices through Large Language Models for Enhanced Knowledge Transfer and Practical Application. arXiv preprint arXiv:2503.04788 (2025). https:\/\/arxiv.org\/abs\/2503.04788"},{"key":"e_1_3_2_1_39_1","volume-title":"Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347","author":"Schulman John","year":"2017","unstructured":"John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. 2017. Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)."},{"key":"e_1_3_2_1_40_1","unstructured":"Zheng Shao et al. 2024. DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. arXiv preprint arXiv:2402.03300 (2024). https:\/\/arxiv.org\/abs\/2402.03300"},{"key":"e_1_3_2_1_41_1","volume-title":"AgroBench: Vision-Language Model Benchmark in Agriculture. arXiv preprint arXiv:2507.20519","author":"Shinoda Risa","year":"2025","unstructured":"Risa Shinoda, Nakamasa Inoue, Hirokatsu Kataoka, Masaki Onishi, and Yoshitaka Ushiku. 2025. AgroBench: Vision-Language Model Benchmark in Agriculture. arXiv preprint arXiv:2507.20519 (2025)."},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL).","author":"Suwon","unstructured":"Suwon Shon et al., 2023. SLUE Phase-2: A Benchmark Suite of Diverse Spoken Language Understanding Tasks. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (ACL)."},{"key":"e_1_3_2_1_43_1","volume-title":"SLUE: New Benchmark Tasks for Spoken Language Understanding Evaluation on Natural Speech. arXiv preprint arXiv:2111.10367","author":"Shon Suwon","year":"2021","unstructured":"Suwon Shon, Santiago Pascual, et al., 2021. SLUE: New Benchmark Tasks for Spoken Language Understanding Evaluation on Natural Speech. arXiv preprint arXiv:2111.10367 (2021)."},{"key":"e_1_3_2_1_44_1","unstructured":"Virginia Teller. 2000. Speech and language processing: an introduction to natural language processing computational linguistics and speech recognition."},{"key":"e_1_3_2_1_45_1","volume-title":"Semi-Supervised Learning and Interpretation. arXiv preprint arXiv:2101.00390","author":"Wang Changhan","year":"2021","unstructured":"Changhan Wang, Morgane Riviere, Ann Lee, Anne Wu, Chaitanya Talnikar, Daniel Haziza, Mary Williamson, Juan Pino, and Emmanuel Dupoux. 2021. VoxPopuli: A Large-Scale Multilingual Speech Corpus for Representation Learning, Semi-Supervised Learning and Interpretation. arXiv preprint arXiv:2101.00390 (2021)."},{"key":"e_1_3_2_1_46_1","unstructured":"Xiaoping Wu Chi Zhan Yu-Kun Lai Ming-Ming Cheng and Jufeng Yang. 2019. IP102: A Large-Scale Benchmark Dataset for Insect Pest Recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). https:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Wu_IP102_A_Large-Scale_Benchmark_Dataset_for_Insect_Pest_Recognition_CVPR_2019_paper.html"},{"key":"e_1_3_2_1_47_1","unstructured":"Zhiyu Wu et al. 2024. DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding. arXiv preprint arXiv:2412.10302 (2024)."},{"key":"e_1_3_2_1_48_1","unstructured":"Jin Xu Zhifang Guo Jinzheng He Hangrui Hu Ting He Shuai Bai Keqin Chen Jialin Wang Yang Fan Kai Dang et al. 2025. Qwen2. 5-omni technical report. arXiv preprint arXiv:2503.20215 (2025)."},{"key":"e_1_3_2_1_49_1","volume-title":"AgriGPT-VL: Agricultural Vision-Language Understanding Suite. arXiv preprint arXiv:2503.01234","author":"Yang Bo","year":"2025","unstructured":"Bo Yang, Yibo Chen, and Liang Feng. 2025a. AgriGPT-VL: Agricultural Vision-Language Understanding Suite. arXiv preprint arXiv:2503.01234 (2025)."},{"key":"e_1_3_2_1_50_1","volume-title":"AgriGPT: A Large Language Model Ecosystem for Agriculture. arXiv preprint arXiv:2508.08632","author":"Yang Bo","year":"2025","unstructured":"Bo Yang, Yu Zhang, Lanfei Feng, Yunkui Chen, Jianyu Zhang, Xiao Xu, Nueraili Aierken, Yurui Li, Yuxuan Chen, Guijun Yang, Yong He, Runhe Huang, and Shijian Li. 2025b. AgriGPT: A Large Language Model Ecosystem for Agriculture. arXiv preprint arXiv:2508.08632 (2025). https:\/\/arxiv.org\/abs\/2508.08632"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775"},{"key":"e_1_3_2_1_52_1","volume-title":"Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al., 2024. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Yi: Open Foundation Models by 01.AI. arXiv preprint arXiv:2403.04652","author":"Andy Young","year":"2024","unstructured":"Andy Young et al., 2024. Yi: Open Foundation Models by 01.AI. arXiv preprint arXiv:2403.04652 (2024)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.3390\/electronics14102087"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Xiang Yue et al. 2024. MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert-Level Complex Tasks. In CVPR. https:\/\/openaccess.thecvf.com\/content\/CVPR2024\/papers\/Yue_MMMU_A_Massive_Multi-discipline_Multimodal_Understanding_and_Reasoning_Benchmark_for_CVPR_2024_paper.pdf","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"e_1_3_2_1_56_1","volume-title":"AgriDoctor: A Multimodal Intelligent Assistant for Agriculture. arXiv preprint arXiv:2501.05678","author":"Zhang Ming","year":"2025","unstructured":"Ming Zhang, Zhi Xu, and Peng Wang. 2025. AgriDoctor: A Multimodal Intelligent Assistant for Agriculture. arXiv preprint arXiv:2501.05678 (2025)."},{"key":"e_1_3_2_1_57_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric P. Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena. In NeurIPS. https:\/\/arxiv.org\/abs\/2306.05685"},{"key":"e_1_3_2_1_58_1","volume-title":"AgriBench: A Hierarchical Agriculture Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2409.12345","author":"Zhou Yang","year":"2024","unstructured":"Yang Zhou and Masashi Ryo. 2024. AgriBench: A Hierarchical Agriculture Benchmark for Multimodal Large Language Models. arXiv preprint arXiv:2409.12345 (2024)."},{"key":"e_1_3_2_1_59_1","unstructured":"Jinguo Zhu Weiyun Wang Zhe Chen Zhaoyang Liu Shenglong Ye Lixin Gu Hao Tian Yuchen Duan Weijie Su Jie Shao et al. 2025. Internvl3: Exploring advanced training and test-time recipes for open-source multimodal models. arXiv preprint arXiv:2504.10479 (2025)."}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"deposited":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:30:28Z","timestamp":1775838628000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792971"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":59,"alternative-id":["10.1145\/3774904.3792971","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792971","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}