{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T08:33:10Z","timestamp":1768120390695,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":30,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557608","type":"print"},{"value":"9789819557615","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5761-5_26","type":"book-chapter","created":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T05:52:42Z","timestamp":1768110762000},"page":"369-383","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["DeltaMMEval: A Contrastive Benchmark for\u00a0Fine-Grained Semantic Sensitivity in\u00a0Multimodal Models"],"prefix":"10.1007","author":[{"given":"Yan","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yiqiang","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,12]]},"reference":[{"key":"26_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"26_CR2","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR3","unstructured":"Anthropic: Claude 3.7 model overview (2025). https:\/\/www.anthropic.com\/index\/claude-3. Accessed June 2025"},{"key":"26_CR4","unstructured":"Anthropic: Claude 4 (opus 4 & sonnet 4) model overview (2025). https:\/\/www.anthropic.com\/news\/claude-4. Released 22 May 2025. Accessed via API"},{"key":"26_CR5","doi-asserted-by":"crossref","unstructured":"Chen, G., Shen, L., Shao, R., Deng, X., Nie, L.: Lion: empowering multimodal large language model with dual-level visual knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26540\u201326550 (2024)","DOI":"10.1109\/CVPR52733.2024.02506"},{"key":"26_CR6","unstructured":"Chen, X., et\u00a0al.: Pali: a jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)"},{"issue":"12","key":"26_CR7","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5","volume":"67","author":"Z Chen","year":"2024","unstructured":"Chen, Z., et al.: How far are we to GPT-4v? Closing the gap to commercial multimodal models with open-source suites. Sci. China Inf. Sci. 67(12), 220101 (2024)","journal-title":"Sci. China Inf. Sci."},{"key":"26_CR8","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"26_CR9","unstructured":"Driess, D., et\u00a0al.: Palm-e: an embodied multimodal language model (2023)"},{"key":"26_CR10","first-page":"43593","volume":"36","author":"Z Gharaee","year":"2023","unstructured":"Gharaee, Z., et al.: A step towards worldwide biodiversity assessment: the Bioscan-1m insect dataset. Adv. Neural. Inf. Process. Syst. 36, 43593\u201343619 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR11","unstructured":"Grattafiori, A., et\u00a0al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"26_CR12","unstructured":"Hurst, A., et\u00a0al.: GPT-4o system card. arXiv preprint arXiv:2410.21276 (2024)"},{"issue":"8","key":"26_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3711680","volume":"57","author":"J Kuang","year":"2025","unstructured":"Kuang, J., et al.: Natural language understanding and inference with MLLM in visual question answering: a survey. ACM Comput. Surv. 57(8), 1\u201336 (2025)","journal-title":"ACM Comput. Surv."},{"key":"26_CR14","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/s10590-009-9059-4","volume":"23","author":"A Lavie","year":"2009","unstructured":"Lavie, A., Denkowski, M.J.: The meteor metric for automatic evaluation of machine translation. Mach. Transl. 23, 105\u2013115 (2009)","journal-title":"Mach. Transl."},{"key":"26_CR15","doi-asserted-by":"crossref","unstructured":"Li, L., Lei, J., Gan, Z., Liu, J.: Adversarial VQA: a new benchmark for evaluating the robustness of VQA models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2042\u20132051 (2021)","DOI":"10.1109\/ICCV48922.2021.00205"},{"key":"26_CR16","unstructured":"Lin, C.: Recall-oriented understudy for Gisting evaluation (rouge) (2005). Retrieved 20 Aug 2005"},{"key":"26_CR17","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural. Inf. Process. Syst. 36, 34892\u201334916 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"26_CR18","unstructured":"Lu, H., et al.: Deepseek-VL: towards real-world vision-language understanding (2024)"},{"key":"26_CR19","unstructured":"OpenAI: GPT-4 with vision (2023). https:\/\/openai.com\/index\/gpt-4#vision. Accessed via \u2018gpt-image-1\u2018 API"},{"key":"26_CR20","unstructured":"OpenAI: Gpt-4.1 technical overview (2024). https:\/\/openai.com. Accessed via OpenAI API"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BleU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"26_CR22","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"26_CR23","doi-asserted-by":"crossref","unstructured":"Raj, C., Mukherjee, A., Caliskan, A., Anastasopoulos, A., Zhu, Z.: Biasdora: exploring hidden biased associations in vision-language models. arXiv preprint arXiv:2407.02066 (2024)","DOI":"10.18653\/v1\/2024.findings-emnlp.611"},{"key":"26_CR24","unstructured":"Team, G., et\u00a0al.: Gemini 1.5: unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)"},{"key":"26_CR25","unstructured":"Wang, G., et al.: Voyager: an open-ended embodied agent with large language models. arXiv preprint arXiv:2305.16291 (2023)"},{"key":"26_CR26","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Logo-2k+: a large-scale logo dataset for scalable logo classification. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 6194\u20136201 (2020)","DOI":"10.1609\/aaai.v34i04.6085"},{"key":"26_CR27","unstructured":"Wang, P., et\u00a0al.: Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"26_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Multimodal LLM enhanced cross-lingual cross-modal retrieval. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 8296\u20138305 (2024)","DOI":"10.1145\/3664647.3680886"},{"key":"26_CR29","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: BERTScore: evaluating text generation with BERT. arXiv preprint arXiv:1904.09675 (2019)"},{"key":"26_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1007\/978-3-031-72649-1_5","volume-title":"Computer Vision - ECCV 2024","author":"K Zheng","year":"2024","unstructured":"Zheng, K., et al.: DreamLIP: language-image pre-training with long captions. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) ECCV 2024. LNCS, vol. 15076, pp. 73\u201390. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-72649-1_5"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5761-5_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T05:52:45Z","timestamp":1768110765000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5761-5_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557608","9789819557615"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5761-5_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"12 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}