{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,17]],"date-time":"2026-05-17T02:54:30Z","timestamp":1778986470388,"version":"3.51.4"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730153","type":"print"},{"value":"9783031730160","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73016-0_10","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T19:02:40Z","timestamp":1729882960000},"page":"157-170","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["A Multimodal Benchmark Dataset and\u00a0Model for\u00a0Crop Disease Diagnosis"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2492-403X","authenticated-orcid":false,"given":"Xiang","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1267-0277","authenticated-orcid":false,"given":"Zhaoxiang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6697-3220","authenticated-orcid":false,"given":"Huan","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6796-6043","authenticated-orcid":false,"given":"Zezhou","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5863-2288","authenticated-orcid":false,"given":"Kohou","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1171-0281","authenticated-orcid":false,"given":"Kai","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4308-7049","authenticated-orcid":false,"given":"Shiguo","family":"Lian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"10_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"10_CR2","doi-asserted-by":"crossref","unstructured":"Agarwal, M., Sinha, A., Gupta, S.K., Mishra, D., Mishra, R.: Potato crop disease classification using convolutional neural network. In: Smart Systems and IoT: Innovations in Computing: Proceeding of SSIC 2019. pp. 391\u2013400. Springer (2020)","DOI":"10.1007\/978-981-13-8406-6_37"},{"key":"10_CR3","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR4","unstructured":"Anil, R., Dai, A.M., Firat, O., Johnson, M., Lepikhin, D., Passos, A., Shakeri, S., Taropa, E., Bailey, P., Chen, Z., et\u00a0al.: Palm 2 technical report. arXiv preprint arXiv:2305.10403 (2023)"},{"key":"10_CR5","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: Visual question answering. In: Proceedings of the IEEE international conference on computer vision. pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"10_CR6","doi-asserted-by":"crossref","unstructured":"Arya, S., Singh, R.: A comparative study of cnn and alexnet for detection of disease in potato and mango leaf. In: 2019 International conference on issues and challenges in intelligent computing techniques (ICICT). vol.\u00a01, pp.\u00a01\u20136. IEEE (2019)","DOI":"10.1109\/ICICT46931.2019.8977648"},{"key":"10_CR7","unstructured":"Askell, A., Bai, Y., Chen, A., Drain, D., Ganguli, D., Henighan, T., Jones, A., Joseph, N., Mann, B., DasSarma, N., et\u00a0al.: A general language assistant as a laboratory for alignment. arXiv preprint arXiv:2112.00861 (2021)"},{"key":"10_CR8","unstructured":"Bai, J., Bai, S., Chu, Y., Cui, Z., Dang, K., Deng, X., Fan, Y., Ge, W., Han, Y., Huang, F., et\u00a0al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"10_CR9","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., Zhou, J.: Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond (2023)"},{"key":"10_CR10","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et\u00a0al.: Language models are few-shot learners advances in neural information processing systems 33 (2020)"},{"key":"10_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"10_CR12","doi-asserted-by":"publisher","DOI":"10.1016\/j.atech.2022.100108","volume":"3","author":"L Divyanth","year":"2023","unstructured":"Divyanth, L., Ahmad, A., Saraswat, D.: A two-stage deep-learning based segmentation model for crop disease quantification based on corn field imagery. Smart Agricultural Technology 3, 100108 (2023)","journal-title":"Smart Agricultural Technology"},{"key":"10_CR13","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"10_CR14","doi-asserted-by":"crossref","unstructured":"Gan, Z., Li, L., Li, C., Wang, L., Liu, Z., Gao, J., et\u00a0al.: Vision-language pre-training: Basics, recent advances, and future trends. Foundations and Trends\u00ae in Computer Graphics and Vision 14(3\u20134), 163\u2013352 (2022)","DOI":"10.1561\/0600000105"},{"key":"10_CR15","unstructured":"Gao, P., Han, J., Zhang, R., Lin, Z., Geng, S., Zhou, A., Zhang, W., Lu, P., He, C., Yue, X., et\u00a0al.: Llama-adapter v2: Parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010 (2023)"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"10_CR17","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., Chen, W.: Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"10_CR18","doi-asserted-by":"publisher","first-page":"818","DOI":"10.1007\/s00034-019-01041-0","volume":"39","author":"A Khamparia","year":"2020","unstructured":"Khamparia, A., Saini, G., Gupta, D., Khanna, A., Tiwari, S., de Albuquerque, V.H.C.: Seasonal crops disease prediction and classification using deep convolutional encoder network. Circuits Systems Signal Process. 39, 818\u2013836 (2020)","journal-title":"Circuits Systems Signal Process."},{"key":"10_CR19","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25 (2012)"},{"key":"10_CR20","doi-asserted-by":"publisher","first-page":"1064399","DOI":"10.3389\/fpls.2022.1064399","volume":"13","author":"Y Lan","year":"2023","unstructured":"Lan, Y., Guo, Y., Chen, Q., Lin, S., Chen, Y., Deng, X.: Visual question answering model for fruit tree disease decision-making based on multimodal deep learning. Front. Plant Sci. 13, 1064399 (2023)","journal-title":"Front. Plant Sci."},{"key":"10_CR21","first-page":"9287","volume":"35","author":"C Li","year":"2022","unstructured":"Li, C., Liu, H., Li, L., Zhang, P., Aneja, J., Yang, J., Jin, P., Hu, H., Liu, Z., Lee, Y.J., et al.: Elevater: A benchmark and toolkit for evaluating language-augmented visual models. Adv. Neural. Inf. Process. Syst. 35, 9287\u20139301 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR22","unstructured":"Li, C., Wong, C., Zhang, S., Usuyama, N., Liu, H., Yang, J., Naumann, T., Poon, H., Gao, J.: Llava-med: Training a large language-and-vision assistant for biomedicine in one day. Advances in Neural Information Processing Systems 36 (2024)"},{"key":"10_CR23","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"10_CR24","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Advances in neural information processing systems 36 (2024)"},{"key":"10_CR25","doi-asserted-by":"crossref","unstructured":"Morbekar, A., Parihar, A., Jadhav, R.: Crop disease detection using yolo. In: 2020 international conference for emerging technology (INCET). pp.\u00a01\u20135. IEEE (2020)","DOI":"10.1109\/INCET49848.2020.9153986"},{"key":"10_CR26","unstructured":"Peng, B., Li, C., He, P., Galley, M., Gao, J.: Instruction tuning with gpt-4. arXiv preprint arXiv:2304.03277 (2023)"},{"key":"10_CR27","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: Unified, real-time object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 779\u2013788 (2016)","DOI":"10.1109\/CVPR.2016.91"},{"issue":"11","key":"10_CR28","doi-asserted-by":"publisher","first-page":"468","DOI":"10.3390\/plants8110468","volume":"8","author":"MH Saleem","year":"2019","unstructured":"Saleem, M.H., Potgieter, J., Arif, K.M.: Plant disease detection and classification by deep learning. Plants 8(11), 468 (2019)","journal-title":"Plants"},{"key":"10_CR29","doi-asserted-by":"publisher","DOI":"10.3389\/fpls.2021.809506","volume":"12","author":"D Yang","year":"2021","unstructured":"Yang, D., Wang, F., Hu, Y., Lan, Y., Deng, X.: Citrus huanglongbing detection based on multi-modal feature fusion learning. Front. Plant Sci. 12, 809506 (2021)","journal-title":"Front. Plant Sci."},{"key":"10_CR30","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73016-0_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T19:04:46Z","timestamp":1729883086000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73016-0_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031730153","9783031730160"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73016-0_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}