{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T03:38:30Z","timestamp":1773805110045,"version":"3.50.1"},"publisher-location":"Cham","reference-count":71,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729720","type":"print"},{"value":"9783031729737","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72973-7_20","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:03:04Z","timestamp":1730383384000},"page":"340-358","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["BenchLMM: Benchmarking Cross-Style Visual Capability of\u00a0Large Multimodal Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7114-8462","authenticated-orcid":false,"given":"Rizhao","family":"Cai","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5698-6315","authenticated-orcid":false,"given":"Zirui","family":"Song","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9752-1520","authenticated-orcid":false,"given":"Dayan","family":"Guan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9963-7559","authenticated-orcid":false,"given":"Zhenhao","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3147-4303","authenticated-orcid":false,"given":"Yaohang","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0879-1965","authenticated-orcid":false,"given":"Xing","family":"Luo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5002-6549","authenticated-orcid":false,"given":"Chenyu","family":"Yi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6262-8125","authenticated-orcid":false,"given":"Alex","family":"Kot","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, H., et al.: Nocaps: novel object captioning at scale. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8948\u20138957 (2019)","DOI":"10.1109\/ICCV.2019.00904"},{"key":"20_CR2","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VGA: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"20_CR4","unstructured":"Awadalla, A., et al.: Openflamingo: an open-source framework for training large autoregressive vision-language models. 
arXiv preprint arXiv:2308.01390 (2023)"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Ben-Younes, H., Cadene, R., Cord, M., Thome, N.: Mutan: multimodal tucker fusion for visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2612\u20132620 (2017)","DOI":"10.1109\/ICCV.2017.285"},{"issue":"4","key":"20_CR6","doi-asserted-by":"publisher","first-page":"1038","DOI":"10.1007\/s11263-020-01400-4","volume":"129","author":"P Bergmann","year":"2021","unstructured":"Bergmann, P., Batzner, K., Fauser, M., Sattlegger, D., Steger, C.: The mvtec anomaly detection dataset: a comprehensive real-world dataset for unsupervised anomaly detection. Int. J. Comput. Vision 129(4), 1038\u20131059 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"20_CR7","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Cadene, R., Ben-Younes, H., Cord, M., Thome, N.: Murel: multimodal relational reasoning for visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1989\u20131998 (2019)","DOI":"10.1109\/CVPR.2019.00209"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Cao, Q., Wan, W., Wang, K., Liang, X., Lin, L.: Linguistically routing capsule network for out-of-distribution visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1614\u20131623 (2021)","DOI":"10.1109\/ICCV48922.2021.00164"},{"key":"20_CR10","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning (2023)"},{"key":"20_CR11","unstructured":"Chen, L., et al.: Towards end-to-end embodied decision making via multi-modal large language model: explorations with gpt4-vision and beyond. arXiv preprint arXiv:2310.02071 (2023)"},{"key":"20_CR12","unstructured":"Chen, X., et al.: Microsoft coco captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"20_CR13","unstructured":"Chowdhery, A., et\u00a0al.: Palm: scaling language modeling with pathways. arXiv preprint arXiv:2204.02311 (2022)"},{"key":"20_CR14","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Dancette, C., Cadene, R., Teney, D., Cord, M.: Beyond question-based biases: assessing multimodal shortcut learning in visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1574\u20131583 (2021)","DOI":"10.1109\/ICCV48922.2021.00160"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Ding, Y., Yu, J., Liu, B., Hu, Y., Cui, M., Wu, Q.: Mukea: multimodal knowledge extraction and accumulation for knowledge-based visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5089\u20135098 (2022)","DOI":"10.1109\/CVPR52688.2022.00503"},{"key":"20_CR17","unstructured":"Fu, C., et\u00a0al.: MME: a comprehensive evaluation benchmark for multimodal large language models. 
arXiv preprint arXiv:2306.13394 (2023)"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Gao, F., Ping, Q., Thattai, G., Reganti, A., Wu, Y.N., Natarajan, P.: Transform-retrieve-generate: natural language-centric outside-knowledge visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5067\u20135077 (2022)","DOI":"10.1109\/CVPR52688.2022.00501"},{"key":"20_CR20","unstructured":"Ge, J., Luo, H., Qian, S., Gan, Y., Fu, J., Zhan, S.: Chain of thought prompt tuning in vision language models. arXiv preprint arXiv:2304.07919 (2023)"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Ha, Q., Watanabe, K., Karasawa, T., Ushiku, Y., Harada, T.: Mfnet: towards real-time semantic segmentation for autonomous vehicles with multi-spectral scenes. In: 2017 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 5108\u20135115. IEEE (2017)","DOI":"10.1109\/IROS.2017.8206396"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Han, X., Wang, S., Su, C., Huang, Q., Tian, Q.: Greedy gradient ensemble for robust visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1584\u20131593 (2021)","DOI":"10.1109\/ICCV48922.2021.00161"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"20_CR25","doi-asserted-by":"crossref","unstructured":"Hwang, S., Park, J., Kim, N., Choi, Y., So\u00a0Kweon, I.: Multispectral pedestrian detection: benchmark dataset and baseline. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1037\u20131045 (2015)","DOI":"10.1109\/CVPR.2015.7298706"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Jia, X., Zhu, C., Li, M., Tang, W., Zhou, W.: LLVIP: a visible-infrared paired dataset for low-light vision. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3496\u20133504 (2021)","DOI":"10.1109\/ICCVW54120.2021.00389"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Clevr: a diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Inferring and executing programs for visual reasoning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 
2989\u20132998 (2017)","DOI":"10.1109\/ICCV.2017.325"},{"key":"20_CR29","unstructured":"Kim, J.H., Jun, J., Zhang, B.T.: Bilinear attention networks. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"key":"20_CR30","unstructured":"Kim, J.H., On, K.W., Lim, W., Kim, J., Ha, J.W., Zhang, B.T.: Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325 (2016)"},{"key":"20_CR31","first-page":"22199","volume":"35","author":"T Kojima","year":"2022","unstructured":"Kojima, T., Gu, S.S., Reid, M., Matsuo, Y., Iwasawa, Y.: Large language models are zero-shot reasoners. Adv. Neural. Inf. Process. Syst. 35, 22199\u201322213 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR32","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Yang, J., Liu, Z.: Otter: a multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., Shan, Y.: Seed-bench: benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., Wen, J.R.: Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"20_CR36","unstructured":"Liang, P.P., et\u00a0al.: Multibench: multiscale benchmarks for multimodal representation learning. arXiv preprint arXiv:2107.07502 (2021)"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Liu, B., Zhan, L.M., Xu, L., Ma, L., Yang, Y., Wu, X.M.: Slake: a semantically-labeled knowledge-enhanced dataset for medical visual question answering. In: 2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI), pp. 1650\u20131654. IEEE (2021)","DOI":"10.1109\/ISBI48211.2021.9434010"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"20_CR39","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Liu, J., et al.: Target-aware dual adversarial learning and a multi-scenario multi-modality benchmark to fuse infrared and visible for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5802\u20135811 (2022)","DOI":"10.1109\/CVPR52688.2022.00571"},{"key":"20_CR41","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: MMBench: is your multi-modal model an all-around player? arXiv preprint arXiv:2307.06281 (2023)","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"20_CR42","unstructured":"Manyika, J.: An overview of bard: an early experiment with generative ai. AI, Google Static Documents (2023)"},{"key":"20_CR43","doi-asserted-by":"crossref","unstructured":"Mao, X., et al.: COCO-O: a benchmark for object detectors under natural distribution shifts. 
In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6339\u20136350 (2023)","DOI":"10.1109\/ICCV51070.2023.00583"},{"key":"20_CR44","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: Ok-vqa: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204 (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Mitra, C., Huang, B., Darrell, T., Herzig, R.: Compositional chain-of-thought prompting for large multimodal models. arXiv preprint arXiv:2311.17076 (2023)","DOI":"10.1109\/CVPR52733.2024.01367"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Nam, H., Ha, J.W., Kim, J.: Dual attention networks for multimodal reasoning and matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 299\u2013307 (2017)","DOI":"10.1109\/CVPR.2017.232"},{"issue":"4","key":"20_CR47","doi-asserted-by":"publisher","first-page":"565","DOI":"10.3390\/mi13040565","volume":"13","author":"HD Nguyen","year":"2022","unstructured":"Nguyen, H.D., Cai, R., Zhao, H., Kot, A.C., Wen, B.: Towards more efficient security inspection via deep learning: a task-driven x-ray image cropping scheme. Micromachines 13(4), 565 (2022)","journal-title":"Micromachines"},{"key":"20_CR48","unstructured":"OpenAI: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"20_CR49","doi-asserted-by":"crossref","unstructured":"Shah, M., Chen, X., Rohrbach, M., Parikh, D.: Cycle-consistency for robust visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6649\u20136658 (2019)","DOI":"10.1109\/CVPR.2019.00681"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Shi, Z., Zhou, X., Qiu, X., Zhu, X.: Improving image captioning with better use of captions. arXiv preprint arXiv:2006.11807 (2020)","DOI":"10.18653\/v1\/2020.acl-main.664"},{"key":"20_CR51","doi-asserted-by":"crossref","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: Textcaps: a dataset for image captioning with reading comprehension. In: ECCV 2020, Part II, pp. 742\u2013758. Springer, Cham (2020)","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"20_CR52","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"20_CR53","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"20_CR54","doi-asserted-by":"crossref","unstructured":"Tascon-Morales, S., M\u00e1rquez-Neila, P., Sznitman, R.: Logical implications for visual question answering consistency. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6725\u20136735 (2023)","DOI":"10.1109\/CVPR52729.2023.00650"},{"key":"20_CR55","doi-asserted-by":"crossref","unstructured":"Teney, D., Abbasnejad, E., van\u00a0den Hengel, A.: Unshuffling data for improved generalization in visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
1417\u20131427 (2021)","DOI":"10.1109\/ICCV48922.2021.00145"},{"key":"20_CR56","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"20_CR57","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. Adv. Neural. Inf. Process. Syst. 34, 200\u2013212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR58","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"20_CR59","doi-asserted-by":"crossref","unstructured":"Xia, G.S., et al.: Dota: a large-scale dataset for object detection in aerial images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3974\u20133983 (2018)","DOI":"10.1109\/CVPR.2018.00418"},{"key":"20_CR60","unstructured":"Xu, P., et al.: Lvlm-ehub: a comprehensive evaluation benchmark for large vision-language models. arXiv preprint arXiv:2306.09265 (2023)"},{"key":"20_CR61","doi-asserted-by":"crossref","unstructured":"Yang, X., Gao, C., Zhang, H., Cai, J.: Auto-parsing network for image captioning and visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2197\u20132207 (2021)","DOI":"10.1109\/ICCV48922.2021.00220"},{"key":"20_CR62","unstructured":"Yang, Z., et al.: The dawn of LMMs: preliminary explorations with GPT-4V (ision). arXiv preprint arXiv:2309.17421 (2023)"},{"key":"20_CR63","unstructured":"Yang, Z., et al.: Mm-react: prompting chatgpt for multimodal reasoning and action. arXiv preprint arXiv:2303.11381 (2023)"},{"key":"20_CR64","unstructured":"Ye, Q., et al.: mplug-owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"20_CR65","doi-asserted-by":"publisher","unstructured":"Yu, F., et al.: Bdd100k: a diverse driving dataset for heterogeneous multitask learning. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2633\u20132642 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00271","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"20_CR66","doi-asserted-by":"crossref","unstructured":"Yu, S., Wu, P., Liang, P.P., Salakhutdinov, R., Morency, L.P.: PACS: a dataset for physical audiovisual commonsense reasoning. In: European Conference on Computer Vision, pp. 292\u2013309. Springer (2022)","DOI":"10.1007\/978-3-031-19836-6_17"},{"key":"20_CR67","unstructured":"Yu, W., et al.: Mm-vet: evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490 (2023)"},{"key":"20_CR68","doi-asserted-by":"crossref","unstructured":"Zellers, R., Bisk, Y., Farhadi, A., Choi, Y.: From recognition to cognition: visual commonsense reasoning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6720\u20136731 (2019)","DOI":"10.1109\/CVPR.2019.00688"},{"key":"20_CR69","unstructured":"Zhang, Z., Zhang, A., Li, M., Zhao, H., Karypis, G., Smola, A.: Multimodal chain-of-thought reasoning in language models. arXiv preprint arXiv:2302.00923 (2023)"},{"key":"20_CR70","doi-asserted-by":"crossref","unstructured":"Zhou, Y., et al.: Trar: routing the attention spans in transformer for visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
2074\u20132084 (2021)","DOI":"10.1109\/ICCV48922.2021.00208"},{"key":"20_CR71","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72973-7_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,15]],"date-time":"2025-02-15T15:00:50Z","timestamp":1739631650000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72973-7_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031729720","9783031729737"],"references-count":71,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72973-7_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
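The record above is the standard envelope returned by the Crossref REST API for a single work: the payload sits under "message", with the chapter's bibliography in its "reference" array. A minimal sketch of fetching and reading such a record, assuming network access and the third-party requests package (the endpoint pattern is Crossref's public https://api.crossref.org/works/{DOI}; the DOI is taken from the record itself):

    # Fetch the Crossref metadata for this chapter and read a few fields.
    import requests

    DOI = "10.1007/978-3-031-72973-7_20"
    resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
    resp.raise_for_status()
    work = resp.json()["message"]  # the work record sits under "message"

    print(work["title"][0])                           # chapter title
    print(work["type"], work["DOI"])                  # "book-chapter", DOI
    print(len(work.get("reference", [])), "references")
    print(", ".join(a["family"] for a in work.get("author", [])))

As a sanity check when parsing, the declared "reference-count" (71 here) should equal the length of the "reference" array.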