{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T08:59:11Z","timestamp":1768121951093,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":36,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557608","type":"print"},{"value":"9789819557615","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5761-5_28","type":"book-chapter","created":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T05:52:43Z","timestamp":1768110763000},"page":"399-413","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MME-VirtualWorld: Simplifying Multimodal Assessment via\u00a0Programmable Synthetic Benchmarking"],"prefix":"10.1007","author":[{"given":"Zedong","family":"Liu","sequence":"first","affiliation":[]},{"given":"Li","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Shenao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"He","sequence":"additional","affiliation":[]},{"given":"Aoqi","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Jiaxiang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Shikun","family":"Feng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,12]]},"reference":[{"key":"28_CR1","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"28_CR2","unstructured":"Zhang, Y.F., et al.: Beyond LLaVA-HD: diving into high-resolution large multimodal models. arXiv preprint arXiv:2406.08487 (2024)"},{"key":"28_CR3","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., Donahue, J., Luc, P., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"370","DOI":"10.1007\/978-3-031-72643-9_22","volume-title":"Computer Vision - ECCV 2024","author":"L Chen","year":"2025","unstructured":"Chen, L., Li, J., Dong, X., et al.: ShareGPT4V: improving large multi-modal models with better captions. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) ECCV 2024. LNCS, vol. 15075, pp. 370\u2013387. Springer, Cham (2025). https:\/\/doi.org\/10.1007\/978-3-031-72643-9_22"},{"key":"28_CR5","unstructured":"Brown, T., Mann, B., Ryder, N., et al.: Language models are few-shot learners. Adv. Neural Inf. Process. Syst. 33 (2020)"},{"key":"28_CR6","unstructured":"Yin, S., Fu, C., Zhao, S., et al.: A survey on multimodal large language models. arXiv preprint arXiv:2306.13549 (2023)"},{"key":"28_CR7","unstructured":"Chen, Z., Wang, W., Cao, Y., et al.: Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)"},{"key":"28_CR8","unstructured":"Liu, A., Feng, B., Xue, B., et al.: Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)"},{"key":"28_CR9","unstructured":"Zhang, Y.F., Zhang, H., Tian, H., et al.: MME-RealWorld: could your multimodal LLM challenge high-resolution real-world scenarios that are difficult for humans? arXiv preprint arXiv:2408.13257 (2024)"},{"key":"28_CR10","unstructured":"Ying, K., Meng, F., Wang, J., et al.: MMT-Bench: a comprehensive multimodal benchmark for evaluating large vision-language models towards multitask AGI. arXiv preprint arXiv:2404.16006 (2024)"},{"key":"28_CR11","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"28_CR12","unstructured":"Dai, W., Li, J., Li, D., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"28_CR13","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"28_CR14","unstructured":"Fu, C., Chen, P., Shen, Y., et al.: MME: a comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394 (2023)"},{"key":"28_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"216","DOI":"10.1007\/978-3-031-72658-3_13","volume-title":"Computer Vision - ECCV 2024","author":"Y Liu","year":"2025","unstructured":"Liu, Y., Duan, H., Zhang, Y., et al.: MMbench: is your multi-modal model an all-around player? In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) ECCV 2024. LNCS, vol. 15064, pp. 216\u2013233. Springer, Cham (2025). https:\/\/doi.org\/10.1007\/978-3-031-72658-3_13"},{"key":"28_CR16","unstructured":"Li, B., Wang, R., Wang, G., et al.: SEED-bench: benchmarking multimodal LLMs with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)"},{"key":"28_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"148","DOI":"10.1007\/978-3-031-73337-6_9","volume-title":"Computer Vision - ECCV 2024","author":"X Fu","year":"2025","unstructured":"Fu, X., Hu, Y., Li, B., et al.: BLINK: multimodal large language models can see but not perceive. In: Leonardis, A., Ricci, E., Roth, S., Russakovsky, O., Sattler, T., Varol, G. (eds.) ECCV 2024. LNCS, vol. 15081, pp. 148\u2013166. Springer, Cham (2025). https:\/\/doi.org\/10.1007\/978-3-031-73337-6_9"},{"key":"28_CR18","doi-asserted-by":"crossref","unstructured":"Yue, X., Ni, Y., Zhang, K., et al.: MMMU: a massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI. In: Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9556\u20139567 (2024)","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"28_CR19","unstructured":"Lu, P., Bansal, H., Xia, T., et al.: MathVista: evaluating mathematical reasoning of foundation models in visual contexts. arXiv preprint arXiv:2310.02255 (2023)"},{"key":"28_CR20","doi-asserted-by":"crossref","unstructured":"Johnson, J., Hariharan, B., Van Der Maaten, L., et al.: CLEVR: a diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition, pp. 2901\u20132910 (2017)","DOI":"10.1109\/CVPR.2017.215"},{"issue":"7","key":"28_CR21","doi-asserted-by":"publisher","first-page":"2297","DOI":"10.1007\/s10994-022-06299-1","volume":"112","author":"O Struckmeier","year":"2023","unstructured":"Struckmeier, O., Tiwari, K., Kyrki, V.: Autoencoding slow representations for semi-supervised data-efficient regression. Mach. Learn. 112(7), 2297\u20132315 (2023)","journal-title":"Mach. Learn."},{"key":"28_CR22","unstructured":"Gan, C., Schwartz, J., Alter, S., et al.: ThreeDWorld: a platform for interactive multi-modal physical simulation. arXiv preprint arXiv:2007.04954 (2020)"},{"key":"28_CR23","unstructured":"Zhang, P., Dong, X., Wang, B., et al.: InternLM-XComposer: a vision-language large model for advanced text-image comprehension and composition. arXiv preprint arXiv:2309.15112 (2023)"},{"key":"28_CR24","doi-asserted-by":"crossref","unstructured":"Chen, Z., Wu, J., Wang, W., et al.: InternVL: scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: CVPR 2024, pp. 24185\u201324198. IEEE (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"28_CR25","unstructured":"Li, B., Zhang, K., Zhang, H., et al.: LLaVA-NeXT: stronger LLMs supercharge multimodal capabilities in the wild. arXiv preprint arXiv:2405.xxxxx (2024)"},{"key":"28_CR26","unstructured":"Li, Y., Zhang, Y., Wang, C., et al.: Mini-Gemini: mining the potential of multi-modality vision language models. arXiv preprint arXiv:2403.18814 (2024)"},{"key":"28_CR27","doi-asserted-by":"crossref","unstructured":"Li, Z., Yang, B., Liu, Q., et al.: Monkey: image resolution and text label are important things for large multi-modal models. In: CVPR 2024, pp. 26763\u201326773. IEEE (2024)","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"28_CR28","unstructured":"Bai, J., Bai, S., Yang, S., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"28_CR29","unstructured":"Lu, H., Liu, W., Zhang, B., et al.: DeepSeek-VL: towards real-world vision-language understanding. arXiv preprint arXiv:2403.05525 (2024)"},{"key":"28_CR30","doi-asserted-by":"crossref","unstructured":"Hu, A., Xu, H., Zhang, L., et al.: mPLUG-DocOwl2: high-resolution compressing for OCR-free multi-page document understanding. arXiv preprint arXiv:2409.03420 (2024)","DOI":"10.18653\/v1\/2025.acl-long.291"},{"key":"28_CR31","unstructured":"Yao, Y., Yu, T., Zhang, A., et al.: MiniCPM-V: a GPT-4V level MLLM on your phone. arXiv preprint arXiv:2408.01800 (2024)"},{"key":"28_CR32","unstructured":"Long, J., Dai, Y., Yang, G., et al.: Awaker2.5-VL: stably scaling MLLMs with parameter-efficient mixture of experts. arXiv preprint arXiv:2411.10669 (2024)"},{"key":"28_CR33","unstructured":"Hong, W., Wang, W., Ding, M., et al.: CogVLM2: visual language models for image and video understanding. arXiv preprint arXiv:2408.16500 (2024)"},{"key":"28_CR34","unstructured":"Wang, P., Bai, S., Tan, S., et al.: Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"28_CR35","unstructured":"Li, B., Zhang, Y., Guo, D., et al.: LLaVA-OneVision: easy visual task transfer. arXiv preprint arXiv:2408.03326 (2024)"},{"key":"28_CR36","first-page":"24824","volume":"35","author":"J Wei","year":"2022","unstructured":"Wei, J., Wang, X., Schuurmans, D., et al.: Chain-of-thought prompting elicits reasoning in large language models. Adv. Neural. Inf. Process. Syst. 35, 24824\u201324837 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5761-5_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,11]],"date-time":"2026-01-11T05:52:47Z","timestamp":1768110767000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5761-5_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557608","9789819557615"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5761-5_28","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"12 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}