{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T03:31:05Z","timestamp":1773804665047,"version":"3.50.1"},"reference-count":476,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T00:00:00Z","timestamp":1763424000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T00:00:00Z","timestamp":1763424000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1007\/s11432-025-4676-4","type":"journal-article","created":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T13:35:26Z","timestamp":1763645726000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Large multimodal models evaluation: a survey"],"prefix":"10.1007","volume":"68","author":[{"given":"Zicheng","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Junying","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Farong","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Yijin","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Xiangyu","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Xinyu","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Shengyuan","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Ziheng","family":"Jia","sequence":"additional","affiliation":[]},{"given":"Jiahao","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Ye","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Yushuo","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Xiaorong","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Yalun","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Ziheng","family":"Jiao","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Zijian","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Kaiwei","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Kang","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Yuqin","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Ming","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Yue","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Xuemei","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Juntai","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Jinyu","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Ronghui","family":"Li","sequence":"additional","affiliation":[]},{"given":"Donghao","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Yuan","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Xiangyang","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Chunyi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Haoning","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Xiaohong","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Junjun","family":"He","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Hui","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Lin","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zesheng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Huiyu","family":"Duan","sequence":"additional","affiliation":[]},{"given":"Yingjie","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Xiongkuo","family":"Min","sequence":"additional","affiliation":[]},{"given":"Qi","family":"Jia","sequence":"additional","affiliation":[]},{"given":"Dongzhan","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Wenlong","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jiezhang","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Xue","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Junzhi","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Songyang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Haodong","family":"Duan","sequence":"additional","affiliation":[]},{"given":"Guangtao","family":"Zhai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,18]]},"reference":[{"key":"4676_CR1","volume-title":"GPT-4o system card","author":"A Hurst","year":"2024","unstructured":"Hurst A, Lerer A, Goucher A P, et al. GPT-4o system card. 2024. ArXiv:2410.21276"},{"key":"4676_CR2","volume-title":"Gpt-4o mini: advancing cost-efficient intelligence","author":"OpenAI","year":"2024","unstructured":"OpenAI. Gpt-4o mini: advancing cost-efficient intelligence. 2024. https:\/\/openai.com\/index\/gpt-4o-mini-advancing-cost-efficient-intelligence\/"},{"key":"4676_CR3","volume-title":"Gemini 2.5: pushing the Frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities","author":"G Comanici","year":"2025","unstructured":"Comanici G, Bieber E, Schaekermann M, et al. Gemini 2.5: pushing the Frontier with advanced reasoning, multimodality, long context, and next generation agentic capabilities. 2025. ArXiv:2507.06261"},{"key":"4676_CR4","series-title":"Technical Report","volume-title":"Grok-3: The Age of Reasoning Agents","author":"xAI Team","year":"2025","unstructured":"xAI Team. Grok-3: The Age of Reasoning Agents. Technical Report, 2025. https:\/\/x.ai\/news\/grok-3"},{"key":"4676_CR5","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai S, An S. A survey on automatic image caption generation. Neurocomputing, 2018, 311: 291\u2013304","journal-title":"Neurocomputing"},{"key":"4676_CR6","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.cviu.2017.05.001","volume":"163","author":"Q Wu","year":"2017","unstructured":"Wu Q, Teney D, Wang P, et al. Visual question answering: a survey of methods and datasets. Comput Vision Image Understanding, 2017, 163: 21\u201340","journal-title":"Comput Vision Image Understanding"},{"key":"4676_CR7","unstructured":"Zhang C, Zhang C, Zhang M, et al. Text-to-image diffusion models in generative AI: a survey. 2023. ArXiv:2303.07909"},{"key":"4676_CR8","first-page":"32","volume-title":"Proceedings of the 4th International Conference on Artificial Intelligence, Robotics and Control (AIRC)","author":"A Singh","year":"2023","unstructured":"Singh A. A survey of AI text-to-image and AI text-to-video generators. In: Proceedings of the 4th International Conference on Artificial Intelligence, Robotics and Control (AIRC), 2023. 32\u201336"},{"key":"4676_CR9","unstructured":"Yang C, Lu C, Wang Y, et al. Towards ai-45\u00b0 law: a roadmap to trustworthy agi. 2024. ArXiv:2412.14186"},{"key":"4676_CR10","doi-asserted-by":"publisher","first-page":"103255","DOI":"10.1016\/j.displa.2025.103255","volume":"91","author":"Z Zhang","year":"2026","unstructured":"Zhang Z, Wang J, Guo Y, et al. AIBench: towards trustworthy evaluation under the 45\u00b0 law. Displays, 2026, 91: 103255","journal-title":"Displays"},{"key":"4676_CR11","first-page":"1","volume":"21","author":"Z Zhang","year":"2025","unstructured":"Zhang Z, Zhou Y, Li C, et al. Quality assessment in the era of large models: a survey. ACM Trans Multimedia Comput Commun Appl, 2025, 21: 1\u201331","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"key":"4676_CR12","volume-title":"Compassbench large language model leaderboard","author":"Opencompass","year":"2025","unstructured":"Opencompass. Compassbench large language model leaderboard. 2025. https:\/\/rank.opencompass.org.cn\/leaderboard-llm\/"},{"key":"4676_CR13","unstructured":"Zhang K, Li B, Zhang P, et al. Lmms-eval: reality check on the evaluation of large multimodal models. 2024. ArXiv:2407.12772"},{"key":"4676_CR14","unstructured":"Zhong W, Cui R, Guo Y, et al. Agieval: a human-centric benchmark for evaluating foundation models. 2023. ArXiv:2304.06364"},{"key":"4676_CR15","unstructured":"Fu C, Zhang Y F, Yin S, et al. Mme-survey: a comprehensive survey on evaluation of multimodal llms. 2024. ArXiv:2411.15296"},{"key":"4676_CR16","unstructured":"Li L, Chen G, Shi H, et al. A survey on multimodal benchmarks: In the era of large AI models. 2024. ArXiv:2409.18142"},{"key":"4676_CR17","unstructured":"Huang J, Zhang J. A survey on evaluation of multimodal large language models. 2024. ArXiv:2408.15769"},{"key":"4676_CR18","doi-asserted-by":"publisher","first-page":"211301","DOI":"10.1007\/s11432-024-4133-3","volume":"67","author":"X K Min","year":"2024","unstructured":"Min X K, Duan H Y, Sun W, et al. Perceptual video quality assessment: a survey. Sci China Inf Sci, 2024, 67: 211301","journal-title":"Sci China Inf Sci"},{"key":"4676_CR19","doi-asserted-by":"publisher","first-page":"211301","DOI":"10.1007\/s11432-019-2757-1","volume":"63","author":"G T Zhai","year":"2020","unstructured":"Zhai G T, Min X K. Perceptual image quality assessment: a survey. Sci China Inf Sci, 2020, 63: 211301","journal-title":"Sci China Inf Sci"},{"key":"4676_CR20","doi-asserted-by":"publisher","first-page":"220101","DOI":"10.1007\/s11432-024-4231-5","volume":"67","author":"Z Chen","year":"2024","unstructured":"Chen Z, Wang W Y, Tian H, et al. How far are we to GPT-4V? Closing the gap to commercial multimodal models with open-source suites. Sci China Inf Sci, 2024, 67: 220101","journal-title":"Sci China Inf Sci"},{"key":"4676_CR21","doi-asserted-by":"publisher","first-page":"220107","DOI":"10.1007\/s11432-024-4234-4","volume":"67","author":"Y Zhang","year":"2024","unstructured":"Zhang Y, Ji Z, Pang Y W, et al. Modality-experts coordinated adaptation for large multimodal models. Sci China Inf Sci, 2024, 67: 220107","journal-title":"Sci China Inf Sci"},{"key":"4676_CR22","doi-asserted-by":"publisher","first-page":"220103","DOI":"10.1007\/s11432-024-4187-3","volume":"67","author":"Y Z Liu","year":"2024","unstructured":"Liu Y Z, Cao Y, Gao Z W, et al. MMInstruct: a high-quality multi-modal instruction tuning dataset with extensive diversity. Sci China Inf Sci, 2024, 67: 220103","journal-title":"Sci China Inf Sci"},{"key":"4676_CR23","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu H, Li C, Wu Q, et al. Visual instruction tuning. In: Proceedings of Advances in Neural Information Processing Systems, 2023. 36: 34892\u201334916","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR24","unstructured":"Qian Y, Ye H, Fauconnier J P, et al. Mia-bench: towards better instruction following evaluation of multimodal LLMs. 2024. ArXiv:2407.01509"},{"key":"4676_CR25","unstructured":"Ding S, Wu S, Zhao X, et al. Mm-ifengine: towards multimodal instruction following. 2025. ArXiv:2504.07957"},{"key":"4676_CR26","unstructured":"Bitton Y, Bansal H, Hessel J, et al. Visit-bench: a benchmark for vision-language instruction following inspired by real-world use. 2023. ArXiv:2308.06595"},{"key":"4676_CR27","first-page":"8698","volume":"37","author":"Z Liu","year":"2024","unstructured":"Liu Z, Chu T, Zang Y, et al. Mmdu: a multi-turn multi-image dialog understanding benchmark and instruction-tuning dataset for LVLMS. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 8698\u20138733","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR28","doi-asserted-by":"crossref","unstructured":"Liu S, Ying K, Zhang H, et al. Convbench: a multi-turn conversation evaluation benchmark with hierarchical capability for large vision-language models. 2024. ArXiv:2403.20194","DOI":"10.52202\/079017-3195"},{"key":"4676_CR29","doi-asserted-by":"crossref","unstructured":"Kottur S, Moon S, Geramifard A, et al. Simmc 2.0: a task-oriented dialog dataset for immersive multimodal conversations. 2021. ArXiv:2104.08667","DOI":"10.18653\/v1\/2021.emnlp-main.401"},{"key":"4676_CR30","unstructured":"Wang X, Zhou Y, Liu X, et al. Mementos: a comprehensive benchmark for multimodal large language model reasoning over image sequences. 2024. ArXiv:2401.10529"},{"key":"4676_CR31","unstructured":"Wang F, Fu X, Huang J Y, et al. Muirbench: a comprehensive benchmark for robust multi-image understanding. 2024. ArXiv:2406.09411"},{"key":"4676_CR32","unstructured":"Meng F, Wang J, Li C, et al. Mmiu: multimodal multi-image understanding for evaluating large vision-language models. 2024. ArXiv:2408.02718"},{"key":"4676_CR33","unstructured":"Zhao B, Zong Y, Zhang L, et al. Benchmarking multi-image understanding in vision and language models: perception, knowledge, reasoning, and multi-hop reasoning. 2024. ArXiv:2406.12742"},{"key":"4676_CR34","doi-asserted-by":"crossref","unstructured":"Liu H, Zhang X, Xu H, et al. Mibench: evaluating multimodal large language models over multiple images. 2024. ArXiv:2407.15272","DOI":"10.18653\/v1\/2024.emnlp-main.1250"},{"key":"4676_CR35","first-page":"46378","volume":"37","author":"Z Liu","year":"2024","unstructured":"Liu Z, Fang F, Feng X, et al. Ii-bench: an image implication understanding benchmark for multimodal large language models. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 46378\u201346480","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR36","unstructured":"Jiang D, He X, Zeng H, et al. Mantis: interleaved multi-image instruction tuning. 2024. ArXiv:2405.01483"},{"key":"4676_CR37","unstructured":"Song D, Chen S, Chen G H, et al. Milebench: benchmarking MLLMs in long context. 2024. ArXiv:2404.18532"},{"key":"4676_CR38","first-page":"60088","volume":"37","author":"M Kazemi","year":"2024","unstructured":"Kazemi M, Dikkala N, Anand A, et al. Remi: a dataset for reasoning with multiple images. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 60088\u201360109","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR39","unstructured":"Luo F, Chen C, Wan Z, et al. Codis: benchmarking context-dependent visual comprehension for multimodal large language models. 2024. ArXiv:2402.13607"},{"key":"4676_CR40","unstructured":"Huang Y, Meng Z, Liu F, et al. Sparkles: unlocking chats across multiple images for multimodal instruction-following models. 2023. ArXiv:2308.16463"},{"key":"4676_CR41","unstructured":"Xia P, Han S, Qiu S, et al. Mmie: massive multimodal interleaved comprehension benchmark for large vision-language models. 2024. ArXiv:2410.10139"},{"key":"4676_CR42","unstructured":"Liu M, Xu Z, Lin Z, et al. Holistic evaluation for interleaved text-and-image generation. 2024. ArXiv:2406.14643"},{"key":"4676_CR43","first-page":"56","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"P Zhou","year":"2025","unstructured":"Zhou P, Peng X, Song J, et al. Opening: a comprehensive benchmark for judging open-ended interleaved image-text generation. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 56\u201366"},{"key":"4676_CR44","unstructured":"Raza S, Narayanan A, Khazaie V R, et al. Humanibench: a human-centric framework for large multimodal models evaluation. 2025. ArXiv:2505.11454"},{"key":"4676_CR45","unstructured":"Li K, Yang Z, Zhao J, et al. Herm: benchmarking and enhancing multimodal LLMs for human-centric understanding. 2024. ArXiv:2410.06777"},{"key":"4676_CR46","unstructured":"Zhou Z, Wang Q, Lin B, et al. Uniaa: a unified multi-modal image aesthetic assessment baseline and benchmark. 2024. ArXiv:2404.09619"},{"key":"4676_CR47","unstructured":"Liao Z, Liu X, Qin W, et al. Humanaesexpert: advancing a multi-modality foundation model for human image aesthetic assessment. 2025. ArXiv:2503.23907"},{"key":"4676_CR48","doi-asserted-by":"crossref","unstructured":"Sap M, Rashkin H, Chen D, et al. Socialiqa: commonsense reasoning about social interactions. 2019. ArXiv:1904.09728","DOI":"10.18653\/v1\/D19-1454"},{"key":"4676_CR49","doi-asserted-by":"crossref","unstructured":"Shen J, Kim Y, Hulse M, et al. Empathicstories++: a multimodal dataset for empathy towards personal experiences. 2024. ArXiv:2405.15708","DOI":"10.18653\/v1\/2024.findings-acl.268"},{"key":"4676_CR50","volume-title":"Proceedings of the 41st International Conference on Machine Learning","author":"W L Chiang","year":"2024","unstructured":"Chiang W L, Zheng L, Sheng Y, et al. Chatbot Arena: an open platform for evaluating LLMs by human preference. In: Proceedings of the 41st International Conference on Machine Learning, 2024"},{"key":"4676_CR51","first-page":"47669","volume":"36","author":"A K\u00f6pf","year":"2023","unstructured":"K\u00f6pf A, Kilcher Y, Von R\u00fctte D, et al. OpenAssistant conversations-democratizing large language model alignment. In: Proceedings of Advances in Neural Information Processing Systems, 2023. 36: 47669\u201347681","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR52","unstructured":"Guo Y, Ji K, Zhu X, et al. Human-centric evaluation for foundation models. 2025. ArXiv:2506.01793"},{"key":"4676_CR53","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TGRS.2024.3511622","volume":"62","author":"J Gao","year":"2024","unstructured":"Gao J, Zhao L, Li X. Nwpu-moc: a benchmark for fine-grained multicategory object counting in aerial images. IEEE Trans Geosci Remote Sensing, 2024, 62: 1\u201314","journal-title":"IEEE Trans Geosci Remote Sensing"},{"key":"4676_CR54","first-page":"8406","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"K Sun","year":"2025","unstructured":"Sun K, Huang K, Liu X, et al. T2v-compbench: a comprehensive benchmark for compositional text-to-video generation. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 8406\u20138416"},{"key":"4676_CR55","first-page":"86004","volume":"37","author":"X Wu","year":"2024","unstructured":"Wu X, Yu D, Huang Y, et al. Conceptmix: a compositional image generation benchmark with controllable difficulty. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 86004\u201386047","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR56","first-page":"14411","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"Z Zhao","year":"2025","unstructured":"Zhao Z, Lu P, Zhang A, et al. Can machines understand composition? Dataset and benchmark for photographic image composition embedding and understanding. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 14411\u201314421"},{"key":"4676_CR57","first-page":"8317","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A Singh","year":"2019","unstructured":"Singh A, Natarajan V, Shah M, et al. Towards VQA models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019. 8317\u20138326"},{"key":"4676_CR58","doi-asserted-by":"publisher","first-page":"947","DOI":"10.1109\/ICDAR.2019.00156","volume-title":"Proceedings of 2019 International Conference on Document Analysis and Recognition (ICDAR)","author":"A Mishra","year":"2019","unstructured":"Mishra A, Shekhar S, Singh A K, et al. Ocr-vqa: visual question answering by reading text in images. In: Proceedings of 2019 International Conference on Document Analysis and Recognition (ICDAR), 2019. 947\u2013952"},{"key":"4676_CR59","doi-asserted-by":"publisher","first-page":"220102","DOI":"10.1007\/s11432-024-4235-6","volume":"67","author":"Y L Liu","year":"2024","unstructured":"Liu Y L, Li Z, Huang M X, et al. OCRBench: on the hidden mystery of OCR in large multimodal models. Sci China Inf Sci, 2024, 67: 220102","journal-title":"Sci China Inf Sci"},{"key":"4676_CR60","unstructured":"Fu L, Kuang Z, Song J, et al. Ocrbench v2: an improved benchmark for evaluating large multimodal models on visual text localization and reasoning. 2024. ArXiv:2501.00321"},{"key":"4676_CR61","unstructured":"Jia Q, Yue X, Huang S, et al. Visual perception in text strings. 2024. ArXiv:2410.01733"},{"key":"4676_CR62","unstructured":"Huang M, Shi Y, Peng D, et al. OCR-reasoning benchmark: unveiling the true capabilities of MLLMs in complex text-rich image reasoning. 2025. ArXiv:2505.17163"},{"key":"4676_CR63","first-page":"35549","volume":"35","author":"M Zhao","year":"2022","unstructured":"Zhao M, Li B, Wang J, et al. Towards video text visual question answering: benchmark and baseline. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 35: 35549\u201335562","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR64","unstructured":"Li B, Ge Y, Chen Y, et al. Seed-bench-2-plus: benchmarking multimodal large language models with text-rich visual comprehension. 2024. ArXiv:2404.16790"},{"key":"4676_CR65","doi-asserted-by":"publisher","first-page":"220106","DOI":"10.1007\/s11432-024-4250-y","volume":"67","author":"H Feng","year":"2024","unstructured":"Feng H, Liu Q, Liu H, et al. DocPedia: unleashing the power of large multimodal model in the frequency domain for versatile document understanding. Sci China Inf Sci, 2024, 67: 220106","journal-title":"Sci China Inf Sci"},{"key":"4676_CR66","unstructured":"Zhu F, Liu Z, Ng X Y, et al. Mmdocbench: benchmarking large vision-language models for fine-grained visual document understanding. 2024. ArXiv:2410.21311"},{"key":"4676_CR67","first-page":"95963","volume":"37","author":"Y Ma","year":"2024","unstructured":"Ma Y, Zang Y, Chen L, et al. Mmlongbench-doc: benchmarking long-context document understanding with visualizations. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 95963\u201396010","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR68","doi-asserted-by":"publisher","first-page":"67200","DOI":"10.52202\/079017-2145","volume":"37","author":"Y Hui","year":"2024","unstructured":"Hui Y, Lu Y, Zhang H. Uda: a benchmark suite for retrieval augmented generation in real-world document analysis. In: Proceedings of Advances in Neural Information Processing Systems, 2024, 37: 67200\u201367217","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR69","first-page":"13878","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"R Tanaka","year":"2021","unstructured":"Tanaka R, Nishida K, Yoshida S. Visualmrc: machine reading comprehension on document images. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2021. 13878\u201313888"},{"key":"4676_CR70","first-page":"2200","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"M Mathew","year":"2021","unstructured":"Mathew M, Karatzas D, Jawahar C. Docvqa: a dataset for vqa on document images. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2021. 2200\u20132209"},{"key":"4676_CR71","unstructured":"Xia R, Mao S, Yan X, et al. Docgenome: an open large-scale scientific document benchmark for training and testing multi-modal large language models. 2024. ArXiv:2406.11633"},{"key":"4676_CR72","unstructured":"Li S, Shen Y, Chen X, et al. Gdi-bench: a benchmark for general document intelligence with vision and reasoning decoupling. 2025. ArXiv:2505.00063"},{"key":"4676_CR73","first-page":"59708","volume":"36","author":"C Rawles","year":"2023","unstructured":"Rawles C, Li A, Rodriguez D, et al. Androidinthewild: a large-scale dataset for Android device control. In: Proceedings of Advances in Neural Information Processing Systems, 2023. 36: 59708\u201359728","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR74","first-page":"9313","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics","author":"K Cheng","year":"2024","unstructured":"Cheng K, Sun Q, Chu Y, et al. Seeclick: harnessing GUI grounding for advanced visual GUI agents. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, 2024. 9313\u20139332"},{"key":"4676_CR75","unstructured":"Liu J, Song Y, Lin B Y, et al. Visualwebbench: how far have multimodal LLMs evolved in web page understanding and grounding? 2024. ArXiv:2404.05955"},{"key":"4676_CR76","unstructured":"Chen D, Huang Y, Wu S, et al. Gui-world: a video benchmark and dataset for multimodal GUI-oriented understanding. 2024. ArXiv:2406.10819"},{"key":"4676_CR77","doi-asserted-by":"crossref","unstructured":"Lin Z, Zhou Z, Zhao Z, et al. Webuibench: a comprehensive benchmark for evaluating multimodal large language models in webui-to-code. 2025. ArXiv:2506.07818","DOI":"10.18653\/v1\/2025.findings-acl.815"},{"key":"4676_CR78","volume-title":"Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Y C Hsiao","year":"2025","unstructured":"Hsiao Y C, Zubach F, Baechler G, et al. Screenqa: large-scale question-answer pairs over mobile app screenshots. In: Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies, 2025"},{"key":"4676_CR79","first-page":"2263","volume-title":"Proceedings of Findings of the Association for Computational Linguistics","author":"A Masry","year":"2022","unstructured":"Masry A, Do X L, Tan J Q, et al. ChartQA: a benchmark for question answering about charts with visual and logical reasoning. In: Proceedings of Findings of the Association for Computational Linguistics, 2022. 2263\u20132279"},{"key":"4676_CR80","doi-asserted-by":"crossref","unstructured":"Masry A, Islam M S, Ahmed M, et al. Chartqapro: a more diverse and challenging benchmark for chart question answering. 2025. ArXiv:2504.05506","DOI":"10.18653\/v1\/2025.findings-acl.978"},{"key":"4676_CR81","first-page":"7185","volume":"37","author":"W Zhao","year":"2024","unstructured":"Zhao W, Feng H, Liu Q, et al. Tabpedia: towards comprehensive visual table understanding with concept synergy. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 7185\u20137212","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR82","unstructured":"Kim Y, Yim M, Song K Y. Tablevqa-bench: a visual question answering benchmark on multiple table domains. 2024. ArXiv:2404.19205"},{"key":"4676_CR83","first-page":"113569","volume":"37","author":"Z Wang","year":"2024","unstructured":"Wang Z, Xia M, He L, et al. Charting gaps in realistic chart understanding in multimodal LLMs. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 113569\u2013113697","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR84","first-page":"18695","volume":"37","author":"J Roberts","year":"2024","unstructured":"Roberts J, Han K, Houlsby N, et al. Scifibench: benchmarking large multimodal models for scientific figure interpretation. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 18695\u201318728","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR85","doi-asserted-by":"publisher","first-page":"661","DOI":"10.1007\/s10579-020-09517-1","volume":"55","author":"T Hiippala","year":"2021","unstructured":"Hiippala T, Alikhani M, Haverinen J, et al. AI2D-RST: a multimodal corpus of 1000 primary school science diagrams. Lang Resour Evaluation, 2021, 55: 661\u2013688","journal-title":"Lang Resour Evaluation"},{"key":"4676_CR86","unstructured":"Lin M, Xie T, Liu M, et al. Infochartqa: a benchmark for multimodal question answering on infographic charts. 2025. ArXiv:2505.19028"},{"key":"4676_CR87","first-page":"3680","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"M Huang","year":"2025","unstructured":"Huang M, Lai H, Zhang X, et al. Evochart: a benchmark and a self-training approach towards real-world chart understanding. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2025. 3680\u20133688"},{"key":"4676_CR88","doi-asserted-by":"crossref","unstructured":"Foroutan N, Romanou A, Ansaripour M, et al. Wikimixqa: a multimodal benchmark for question answering over tables and charts. 2025. ArXiv:2506.15594","DOI":"10.18653\/v1\/2025.findings-acl.1280"},{"key":"4676_CR89","unstructured":"Xia R, Zhang B, Ye H, et al. Chartx & chartvlm: a versatile benchmark and foundation model for complicated chart reasoning. 2024. ArXiv:2402.12185"},{"key":"4676_CR90","unstructured":"Wu H, Zhang Z, Zhang E, et al. Q-bench: a benchmark for general-purpose foundation models on low-level vision. 2023. ArXiv:2309.14181"},{"key":"4676_CR91","unstructured":"Zhang Z, Wu H, Li C, et al. A-bench: are lmms masters at evaluating AI-generated images? 2024. ArXiv:2406.03070"},{"key":"4676_CR92","doi-asserted-by":"crossref","unstructured":"Li G, Xie Y, Kan M Y. MVP-bench: can large vision\u2013language models conduct multi-level visual perception like humans? 2024. ArXiv:2410.04345","DOI":"10.18653\/v1\/2024.findings-emnlp.789"},{"key":"4676_CR93","first-page":"14325","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"F Wang","year":"2025","unstructured":"Wang F, Wang H, Guo Z, et al. Xlrs-bench: could your multimodal LLMs understand extremely large ultra-high-resolution remote sensing imagery? In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 14325\u201314336"},{"key":"4676_CR94","first-page":"7907","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"W Wang","year":"2025","unstructured":"Wang W, Ding L, Zeng M, et al. Divide, conquer and combine: a training-free framework for high-resolution image perception in multimodal large language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2025. 7907\u20137915"},{"key":"4676_CR95","unstructured":"Zhang Y F, Zhang H, Tian H, et al. Mme-realworld: Could your multimodal LLM challenge high-resolution real-world scenarios that are difficult for humans? 2024. ArXiv:2408.13257"},{"key":"4676_CR96","first-page":"13084","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"P Wu","year":"2024","unstructured":"Wu P, Xie S. V*: guided visual search as a core mechanism in multimodal LLMs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 13084\u201313094"},{"key":"4676_CR97","first-page":"9154","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"X Wang","year":"2025","unstructured":"Wang X, Ma X, Hou X, et al. Facebench: a multi-view multi-level facial attribute VQA dataset for benchmarking face perception MLLMs. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 9154\u20139164"},{"key":"4676_CR98","unstructured":"Liu Z, Qian L, Xie Q, et al. Mmaffben: a multilingual and multimodal affective analysis benchmark for evaluating LLMs and VLMS. 2025. ArXiv:2505.24423"},{"key":"4676_CR99","first-page":"165","volume-title":"Proceedings of European Conference on Computer Vision","author":"Y Li","year":"2024","unstructured":"Li Y, Dao A, Bao W, et al. Facial affective behavior analysis with instruction tuning. In: Proceedings of European Conference on Computer Vision, 2024. 165\u2013186"},{"key":"4676_CR100","unstructured":"Zhou Y, Zhang Z, Cao J, et al. Memo-bench: a multiple benchmark for text-to-image and multimodal large language models on human emotion analysis. 2024. ArXiv:2411.11235"},{"key":"4676_CR101","unstructured":"Sabour S, Liu S, Zhang Z, et al. Emobench: evaluating the emotional intelligence of large language models. 2024. ArXiv:2402.12071"},{"key":"4676_CR102","doi-asserted-by":"crossref","unstructured":"Gao L, Jia Z, Zeng Y, et al. Eemo-bench: a benchmark for multi-modal large language models on image evoked emotion assessment. 2025. ArXiv:2504.16405","DOI":"10.1145\/3746027.3755777"},{"key":"4676_CR103","unstructured":"Huang Y, Yuan Q, Sheng X, et al. Aesbench: an expert benchmark for multimodal large language models on image aesthetics perception. 2024. ArXiv:2401.08276"},{"key":"4676_CR104","doi-asserted-by":"crossref","unstructured":"Zou H P, Samuel V, Zhou Y, et al. Implicitave: an open-source dataset and multimodal LLMs benchmark for implicit attribute value extraction. 2024. ArXiv:2404.15592","DOI":"10.18653\/v1\/2024.findings-acl.20"},{"key":"4676_CR105","first-page":"6392","volume-title":"Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"X Song","year":"2025","unstructured":"Song X, Wu M, Zhu K Q, et al. A cognitive evaluation benchmark of image reasoning and description for large vision-language models. In: Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies, 2025. 6392\u20136409"},{"key":"4676_CR106","unstructured":"Wang J, Li W, Wu Y, et al. Affordance benchmark for MLLMs. 2025. ArXiv:2506.00893"},{"key":"4676_CR107","unstructured":"Wang Y, Liao Y, Liu H, et al. Mm-sap: a comprehensive benchmark for assessing self-awareness of multimodal large language models in perception. 2024. ArXiv:2401.07529"},{"key":"4676_CR108","first-page":"87310","volume":"37","author":"P Tong","year":"2024","unstructured":"Tong P, Brown E, Wu P, et al. Cambrian-1: a fully open, vision-centric exploration of multimodal LLMs. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 87310\u201387356","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR109","first-page":"35414","volume":"37","author":"J Li","year":"2024","unstructured":"Li J, Wei Q, Zhang C, et al. Single image unlearning: efficient machine unlearning in multimodal large language models. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 35414\u201335453","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR110","first-page":"9568","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"S Tong","year":"2024","unstructured":"Tong S, Liu Z, Zhai Y, et al. Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 9568\u20139578"},{"key":"4676_CR111","doi-asserted-by":"crossref","unstructured":"Wu H, Zhu H, Zhang Z, et al. Towards open-ended visual quality comparison. 2024. ArXiv:2402.16641","DOI":"10.1007\/978-3-031-72646-0_21"},{"key":"4676_CR112","first-page":"69","volume-title":"Proceedings of European Conference on Computer Vision","author":"L Yu","year":"2016","unstructured":"Yu L, Poirson P, Yang S, et al. Modeling context in referring expressions. In: Proceedings of European Conference on Computer Vision, 2016. 69\u201385"},{"key":"4676_CR113","first-page":"11","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"J Mao","year":"2016","unstructured":"Mao J, Huang J, Toshev A, et al. Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016. 11\u201320"},{"key":"4676_CR114","first-page":"513","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"J Chen","year":"2025","unstructured":"Chen J, Wei F, Zhao J, et al. Revisiting referring expression comprehension evaluation in the era of large multimodal models. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 513\u2013524"},{"key":"4676_CR115","first-page":"12998","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"W Wang","year":"2024","unstructured":"Wang W, Yue T, Zhang Y, et al. Unveiling parts beyond objects: towards finer-granularity referring expression segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 12998\u201313008"},{"key":"4676_CR116","first-page":"10707","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"B Zhou","year":"2025","unstructured":"Zhou B, Yang H, Chen D, et al. Urbench: a comprehensive benchmark for evaluating large multimodal models in multi-view urban scenarios. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2025. 10707\u201310715"},{"key":"4676_CR117","first-page":"9186","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"J Li","year":"2025","unstructured":"Li J, Zhang X, Zou H, et al. Counts: benchmarking object detectors and multimodal large language models under distribution shifts. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 9186\u20139198"},{"key":"4676_CR118","unstructured":"Tang J, Liu Q, Ye Y, et al. Mtvqa: benchmarking multilingual text-centric visual question answering, 2024. ArXiv:2405.11985"},{"key":"4676_CR119","unstructured":"Xing S, Xiang C, Han Y, et al. Gepbench: evaluating fundamental geometric perception for multimodal large language models. 2024. ArXiv:2412.21036"},{"key":"4676_CR120","unstructured":"Liu J, Liu Z, Cen Z, et al. Can multimodal large language models understand spatial relations? 2025. ArXiv:2505.19015"},{"key":"4676_CR121","doi-asserted-by":"publisher","first-page":"135062","DOI":"10.52202\/079017-4293","volume":"37","author":"A C Cheng","year":"2024","unstructured":"Cheng A C, Yin H, Fu Y, et al. Spatialrgpt: grounded spatial reasoning in vision-language models. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 135062\u2013135093","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR122","first-page":"29569","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"Y Zhu","year":"2025","unstructured":"Zhu Y, Wang Z, Zhang C, et al. Cospace: benchmarking continuous space perception ability for vision-language models. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 29569\u201329579"},{"key":"4676_CR123","unstructured":"Kil J, Mai Z, Lee J, et al. MLLM-Compbench: a comparative reasoning benchmark for multimodal LLMs. 2024. ArXiv:2407.16837"},{"key":"4676_CR124","first-page":"13384","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A Wang","year":"2024","unstructured":"Wang A, Wu B, Chen S, et al. Sok-bench: a situated video reasoning benchmark with aligned open-world knowledge. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 13384\u201313394"},{"key":"4676_CR125","unstructured":"Rajabi N, Kosecka J. Gsr-bench: a benchmark for grounded spatial reasoning evaluation via multimodal LLMs. 2024. ArXiv:2406.13246"},{"key":"4676_CR126","doi-asserted-by":"crossref","unstructured":"Kamath A, Hessel J, Chang K W. What\u2019s \u201cup\u201d with vision-language models? Investigating their struggle with spatial reasoning. 2023. ArXiv:2310.19785","DOI":"10.18653\/v1\/2023.emnlp-main.568"},{"key":"4676_CR127","doi-asserted-by":"crossref","unstructured":"Liao Y H, Mahmood R, Fidler S, et al. Reasoning paths with reference objects elicit quantitative spatial reasoning in large vision-language models, 2024. ArXiv:2409.09788","DOI":"10.18653\/v1\/2024.emnlp-main.947"},{"key":"4676_CR128","doi-asserted-by":"crossref","unstructured":"Wang W, Ren Y, Luo H, et al. The all-seeing project v2: towards general relation comprehension of the open world, 2024. ArXiv:2402.19474","DOI":"10.1007\/978-3-031-73414-4_27"},{"key":"4676_CR129","first-page":"8612","volume":"37","author":"H Shao","year":"2024","unstructured":"Shao H, Qian S, Xiao H, et al. Visual cot: advancing multi-modal language models with a comprehensive dataset and benchmark for chain-of-thought reasoning. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 8612\u20138642","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR130","unstructured":"Xiao Y, Sun E, Liu T, et al. Logicvista: multimodal LLM logical reasoning benchmark in visual contexts. 2024. ArXiv:2407.04973"},{"key":"4676_CR131","unstructured":"Xu W, Wang J, Wang W, et al. Visulogic: a benchmark for evaluating visual reasoning in multi-modal large language models. 2025. ArXiv:2504.15279"},{"key":"4676_CR132","first-page":"23678","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Z Cheng","year":"2025","unstructured":"Cheng Z, Chen Q, Zhang J, et al. Comt: a novel benchmark for chain of multi-modal thought on large vision-language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2025. 23678\u201323686"},{"key":"4676_CR133","doi-asserted-by":"publisher","first-page":"127059","DOI":"10.52202\/079017-4035","volume":"37","author":"B Estermann","year":"2024","unstructured":"Estermann B, Lanzend\u00f6rfer L, Niedermayr Y, et al. Puzzles: a benchmark for neural algorithmic reasoning. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 127059\u2013127098","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR134","first-page":"115146","volume":"37","author":"H H Zhao","year":"2024","unstructured":"Zhao H H, Zhou P, Gao D, et al. Lova3: learning to visual question answering, asking and assessment. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 115146\u2013115175","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR135","doi-asserted-by":"publisher","first-page":"9257","DOI":"10.52202\/079017-0294","volume":"37","author":"H Huang","year":"2024","unstructured":"Huang H, Zhong H, Yu T, et al. Vlkeb: a large vision-language model knowledge editing benchmark. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 9257\u20139280","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR136","unstructured":"Du Y, Jiang K, Gao Z, et al. Mmke-bench: a multimodal editing benchmark for diverse visual knowledge. 2025. ArXiv:2502.19870"},{"key":"4676_CR137","doi-asserted-by":"crossref","unstructured":"Zhang J, Zhang H, Yin X, et al. Mc-mke: a fine-grained multimodal knowledge editing benchmark emphasizing modality consistency. 2024. ArXiv:2406.13219","DOI":"10.18653\/v1\/2025.findings-acl.896"},{"key":"4676_CR138","doi-asserted-by":"crossref","unstructured":"Li J, Du M, Zhang C, et al. Mike: a new benchmark for fine-grained multimodal entity knowledge editing. 2024. ArXiv:2402.14835","DOI":"10.18653\/v1\/2024.findings-acl.298"},{"key":"4676_CR139","doi-asserted-by":"crossref","unstructured":"Zhang Y, Su Y, Liu Y, et al. Negvqa: can vision language models understand negation? 2025. ArXiv:2505.22946","DOI":"10.18653\/v1\/2025.findings-acl.191"},{"key":"4676_CR140","unstructured":"Xu P, Shao W, Zhang K, et al. Lvlm-ehub: a comprehensive evaluation benchmark for large vision-language models. 2023. ArXiv:2306.09265"},{"key":"4676_CR141","unstructured":"Shao W, Lei M, Hu Y, et al. Tinylvlm-ehub: towards comprehensive and efficient evaluation for large vision-language models. 2024. ArXiv:2308.03729"},{"key":"4676_CR142","unstructured":"Yin Z, Wang J, Cao J, et al. Lamm: language-assisted multi-modal instruction-tuning dataset, framework, and benchmark. 2024. ArXiv:2306.06687"},{"key":"4676_CR143","unstructured":"Fu C, Chen P, Shen Y, et al. Mme: a comprehensive evaluation benchmark for multimodal large language models. 2024. ArXiv:2306.13394"},{"key":"4676_CR144","unstructured":"Liu Y, Duan H, Zhang Y, et al. Mmbench: is your multi-modal model an all-around player? 2023. ArXiv:2307.06281"},{"key":"4676_CR145","unstructured":"Li B, Wang R, Wang G, et al. Seed-bench: benchmarking multimodal LLMs with generative comprehension. 2023. ArXiv:2307.16125"},{"key":"4676_CR146","first-page":"13299","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"B Li","year":"2024","unstructured":"Li B, Ge Y, Ge Y, et al. Seed-bench-2: benchmarking multimodal large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024. 13299\u201313308"},{"key":"4676_CR147","unstructured":"Ying K, Meng F, Wang J, et al. Mmt-bench: a comprehensive multimodal benchmark for evaluating large vision-language models towards multitask agi. 2024. ArXiv:2404.16006"},{"key":"4676_CR148","volume-title":"Lmms-eval: accelerating the development of large multimodal models","author":"B Li","year":"2024","unstructured":"Li B, Zhang P, Zhang K, et al. Lmms-eval: accelerating the development of large multimodal models. Version v0.1.0. 2024. https:\/\/github.com\/EvolvingLMMs-Lab\/lmms-eval"},{"key":"4676_CR149","unstructured":"Chen L, Li J, Dong X, et al. Are we on the right way for evaluating large vision-language models? 2024. ArXiv:2403.20330"},{"key":"4676_CR150","unstructured":"Li B, Lin Z, Peng W, et al. Naturalbench: evaluating vision-language models on natural adversarial samples, 2025. ArXiv:2410.14669"},{"key":"4676_CR151","unstructured":"Yu W, Yang Z, Li L, et al. Mm-vet: evaluating large multimodal models for integrated capabilities. 2024. ArXiv:2308.02490"},{"key":"4676_CR152","unstructured":"Shi Z, Wang Z, Fan H, et al. Chef: a comprehensive evaluation framework for standardized assessment of multimodal large language models. 2023. ArXiv:2311.02692"},{"key":"4676_CR153","doi-asserted-by":"crossref","unstructured":"Fu C, Dai Y, Luo Y, et al. Video-mme: the first-ever comprehensive evaluation benchmark of multi-modal LLMs in video analysis. 2025. ArXiv:2405.21075","DOI":"10.1109\/CVPR52734.2025.02245"},{"key":"4676_CR154","unstructured":"Fang X, Mao K, Duan H, et al. Mmbench-video: a long-form multi-shot benchmark for holistic video understanding. 2024. ArXiv:2406.14515"},{"key":"4676_CR155","doi-asserted-by":"crossref","unstructured":"Li K, Wang Y, He Y, et al. Mvbench: a comprehensive multi-modal video understanding benchmark. 2024. ArXiv:2311.17005","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"4676_CR156","unstructured":"Wu H, Li D, Chen B, et al. Longvideobench: a benchmark for long-context interleaved video-language understanding. 2024. ArXiv:2407.15754"},{"key":"4676_CR157","unstructured":"Wang W, He Z, Hong W, et al. Lvbench: an extreme long video understanding benchmark. 2025. ArXiv:2406.08035"},{"key":"4676_CR158","doi-asserted-by":"crossref","unstructured":"Hong W, Cheng Y, Yang Z, et al. Motionbench: benchmarking and improving fine-grained video motion understanding for vision language models. 2025. ArXiv:2501.02955","DOI":"10.1109\/CVPR52734.2025.00791"},{"key":"4676_CR159","unstructured":"Wang B, Zou X, Lin G, et al. Audiobench: a universal benchmark for audio large language models. 2025. ArXiv:2406.16020"},{"key":"4676_CR160","unstructured":"Yang Q, Xu J, Liu W, et al. Air-bench: benchmarking large audio-language models via generative comprehension. 2024. ArXiv:2402.07729"},{"key":"4676_CR161","doi-asserted-by":"crossref","unstructured":"yu Huang C, Lu K H, Wang S H, et al. Dynamic-superb: towards a dynamic, collaborative, and comprehensive instruction-tuning benchmark for speech. 2024. ArXiv:2309.09510","DOI":"10.1109\/ICASSP48485.2024.10448257"},{"key":"4676_CR162","unstructured":"Li M, Chen X, Zhang C, et al. M3dbench: let\u2019s instruct large models with multi-modal 3d prompts. 2023. ArXiv:2312.10763"},{"key":"4676_CR163","unstructured":"Bai F, Du Y, Huang T, et al. M3d: advancing 3d medical image analysis with multi-modal large language models. 2024. ArXiv:2404.00578"},{"key":"4676_CR164","doi-asserted-by":"crossref","unstructured":"Szymanska E, Dusmanu M, Buurlage J W, et al. Space3d-bench: spatial 3d question answering benchmark. 2024. ArXiv:2408.16662","DOI":"10.1007\/978-3-031-91989-3_5"},{"key":"4676_CR165","first-page":"2507","volume":"35","author":"P Lu","year":"2022","unstructured":"Lu P, Mishra S, Xia T, et al. Learn to explain: multimodal reasoning via thought chains for science question answering. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 35: 2507\u20132521","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR166","unstructured":"He Z, Wu X, Zhou P, et al. Cmmu: a benchmark for Chinese multi-modal multi-type question understanding and reasoning. 2024. ArXiv:2401.14011"},{"key":"4676_CR167","unstructured":"Wang X, Hu Z, Lu P, et al. Scibench: evaluating college-level scientific problem-solving abilities of large language models. 2023. ArXiv:2307.10635"},{"key":"4676_CR168","doi-asserted-by":"crossref","unstructured":"Das R J, Hristov S E, Li H, et al. Exams-v: a multi-discipline multilingual multimodal exam benchmark for evaluating vision language models. 2024. ArXiv:2403.10378","DOI":"10.18653\/v1\/2024.acl-long.420"},{"key":"4676_CR169","first-page":"9556","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"X Yue","year":"2024","unstructured":"Yue X, Ni Y, Zhang K, et al. Mmmu: a massive multi-discipline multimodal understanding and reasoning benchmark for expert AGI. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 9556\u20139567"},{"key":"4676_CR170","unstructured":"Yue X, Zheng T, Ni Y, et al. Mmmu-pro: a more robust multi-discipline multimodal understanding benchmark. 2024. ArXiv:2409.02813"},{"key":"4676_CR171","doi-asserted-by":"crossref","unstructured":"Phan L, Gatti A, Han Z, et al. Humanity\u2019s last exam. 2025. ArXiv:2501.14249","DOI":"10.21203\/rs.3.rs-6615297\/v1"},{"key":"4676_CR172","unstructured":"Cui H, Shamsi Z, Cheon G, et al. Curie: evaluating LLMs on multitask scientific long context understanding and reasoning. 2025. ArXiv:2503.13517"},{"key":"4676_CR173","unstructured":"Zhou Y, Wang Y, He X, et al. Scientists\u2019 first exam: probing cognitive abilities of MLLM via perception, understanding, and reasoning. 2025. ArXiv:2506.10521"},{"key":"4676_CR174","doi-asserted-by":"crossref","unstructured":"Zhou P, Zhang F, Peng X, et al. Mdk12-bench: a multi-discipline benchmark for evaluating reasoning in multimodal large language models. 2025. ArXiv:2504.05782","DOI":"10.2139\/ssrn.5336635"},{"key":"4676_CR175","unstructured":"Wang J, Zhang Z, Guo Y, et al. The ever-evolving science exam. 2025. ArXiv:2507.16514"},{"key":"4676_CR176","unstructured":"Sun R, Chang J, Pearce H, et al. Sok: unifying cybersecurity and cybersafety of multimodal foundation models with an information theory approach. 2024. ArXiv:2411.11195"},{"key":"4676_CR177","first-page":"1","volume":"58","author":"C Zhang","year":"2026","unstructured":"Zhang C, Zhou L, Xu X, et al. Adversarial attacks of vision tasks in the past 10 years: a survey. ACM Comput Surv, 2026, 58: 1\u201342","journal-title":"ACM Comput Surv"},{"key":"4676_CR178","unstructured":"Liu X, Cui X, Li P, et al. Jailbreak attacks and defenses against multimodal generative models: a survey. 2024. ArXiv:2411.09259"},{"key":"4676_CR179","unstructured":"Ye M, Rong X, Huang W, et al. A survey of safety on large vision-language models: attacks, defenses and evaluations. 2025. ArXiv:2502.14881"},{"key":"4676_CR180","first-page":"21001","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"J Wang","year":"2025","unstructured":"Wang J, Zhang H, Yuan Y. Adv-cpg: a customized portrait generation framework with facial adversarial attacks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2025. 21001\u201321010"},{"key":"4676_CR181","doi-asserted-by":"publisher","first-page":"220105","DOI":"10.1007\/s11432-024-4251-x","volume":"67","author":"S K Yin","year":"2024","unstructured":"Yin S K, Fu C Y, Zhao S R, et al. Woodpecker: hallucination correction for multimodal large language models. Sci China Inf Sci, 2024, 67: 220105","journal-title":"Sci China Inf Sci"},{"key":"4676_CR182","unstructured":"Tu H, Cui C, Wang Z, et al. How many unicorns are in this image? A safety evaluation benchmark for vision LLMs. 2023. ArXiv:2311.16101"},{"key":"4676_CR183","unstructured":"Luo W, Ma S, Liu X, et al. Jailbreakv-28k: a benchmark for assessing the robustness of multimodal large language models against jailbreak attacks. 2024. ArXiv:2404.03027"},{"key":"4676_CR184","unstructured":"Liu X, Zhu Y, Lan Y, et al. Query-relevant images jailbreak large multi-modal models. 2023. ArXiv:2311.17600"},{"key":"4676_CR185","doi-asserted-by":"publisher","first-page":"1434","DOI":"10.1109\/TIFS.2024.3520306","volume":"20","author":"H Zhang","year":"2025","unstructured":"Zhang H, Shao W, Liu H, et al. B-AVIBench: toward evaluating the robustness of large vision-language model on black-box adversarial visual-instructions. IEEE Trans Inform Forensic Secur, 2025, 20: 1434\u20131446","journal-title":"IEEE Trans Inform Forensic Secur"},{"key":"4676_CR186","unstructured":"Weng F, Xu Y, Fu C, et al. MMJ-Bench: a comprehensive study on jailbreak attacks and defenses for vision language models. 2024. ArXiv:2408.08464"},{"key":"4676_CR187","unstructured":"Zheng B, Chen G, Zhong H, et al. Usb: a comprehensive and unified safety evaluation benchmark for multimodal large language models. 2025. ArXiv:2505.23793"},{"key":"4676_CR188","doi-asserted-by":"publisher","first-page":"7256","DOI":"10.52202\/079017-0232","volume":"37","author":"T Gu","year":"2024","unstructured":"Gu T, Zhou Z, Huang K, et al. Mllmguard: a multi-dimensional safety evaluation suite for multimodal large language models. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 7256\u20137295","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR189","unstructured":"Ying Z, Liu A, Liang S, et al. Safebench: a safety evaluation framework for multimodal large language models. 2024. ArXiv:2410.18927"},{"key":"4676_CR190","doi-asserted-by":"crossref","unstructured":"Lee D, Jang J, Jeong J, et al. Are vision-language models safe in the wild? A meme-based benchmark study. 2025. ArXiv:2505.15389","DOI":"10.18653\/v1\/2025.emnlp-main.1555"},{"key":"4676_CR191","unstructured":"Qu Y, Shen X, Wu Y, et al. Unsafebench: benchmarking image safety classifiers on real-world and AI-generated images. 2024. ArXiv:2405.03486"},{"key":"4676_CR192","volume-title":"Proceedings of the Conference on Empirical Methods in Natural Language Processing","author":"Y Li","year":"2023","unstructured":"Li Y, Du Y, Zhou K, et al. Evaluating object hallucination in large vision-language models. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, 2023"},{"key":"4676_CR193","first-page":"18135","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"A Gunjal","year":"2024","unstructured":"Gunjal A, Yin J, Bas E. Detecting and preventing hallucinations in large vision language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2024. 18135\u201318143"},{"key":"4676_CR194","doi-asserted-by":"publisher","first-page":"525","DOI":"10.1145\/3664647.3680576","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia","author":"C Jiang","year":"2024","unstructured":"Jiang C, Jia H, Dong M, et al. Hal-eval: a universal and fine-grained hallucination evaluation framework for large vision language models. In: Proceedings of the 32nd ACM International Conference on Multimedia, 2024. 525\u2013534"},{"key":"4676_CR195","doi-asserted-by":"publisher","first-page":"10707","DOI":"10.1145\/3664647.3681251","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia","author":"P Ding","year":"2024","unstructured":"Ding P, Wu J, Kuang J, et al. Hallu-pi: evaluating hallucination in multi-modal large language models within perturbed inputs. In: Proceedings of the 32nd ACM International Conference on Multimedia, 2024. 10707\u201310715"},{"key":"4676_CR196","first-page":"232","volume-title":"Proceedings of European Conference on Computer Vision","author":"M Ye-Bin","year":"2025","unstructured":"Ye-Bin M, Hyeon-Woo N, Choi W, et al. Beaf: observing before-after changes to evaluate hallucination in vision-language models. In: Proceedings of European Conference on Computer Vision, 2025. 232\u2013248"},{"key":"4676_CR197","first-page":"14375","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"T Guan","year":"2024","unstructured":"Guan T, Liu F, Wu X, et al. Hallusionbench: an advanced diagnostic suite for entangled language hallucination and visual illusion in large vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 14375\u201314385"},{"key":"4676_CR198","doi-asserted-by":"crossref","unstructured":"Wu X, Guan T, Li D, et al. Autohallusion: automatic generation of hallucination benchmarks for vision-language models. 2024. ArXiv:2406.10900","DOI":"10.18653\/v1\/2024.findings-emnlp.493"},{"key":"4676_CR199","unstructured":"Zhang Y, Huang Y, Sun Y, et al. Benchmarking trustworthiness of multimodal large language models: a comprehensive study. 2024. ArXiv:2406.07057"},{"key":"4676_CR200","unstructured":"Xu C, Zhang J, Chen Z, et al. Mmdt: decoding the trustworthiness and safety of multimodal foundation models. 2025. ArXiv:2503.14827"},{"key":"4676_CR201","doi-asserted-by":"publisher","first-page":"5769","DOI":"10.18653\/v1\/2024.emnlp-main.329","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","author":"S Nayak","year":"2024","unstructured":"Nayak S, Jain K, Awal R, et al. Benchmarking vision language models for cultural understanding. In: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, 2024. 5769\u20135790"},{"key":"4676_CR202","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","author":"Y Jiang","year":"2024","unstructured":"Jiang Y, Li Z, Shen X, et al. Modscan: measuring stereotypical bias in large vision-language models from vision and language modalities. In: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, 2024"},{"key":"4676_CR203","unstructured":"Wu P, Liu C, Chen C, et al. Fmbench: benchmarking fairness in multimodal large language models on medical tasks. 2024. ArXiv:2410.01089"},{"key":"4676_CR204","volume-title":"Proceedings of the 38th Conference on Neural Information Processing Systems","author":"R Jin","year":"2024","unstructured":"Jin R, Xu Z, Zhong Y, et al. Fairmedfm: fairness benchmarking for medical imaging foundation models. In: Proceedings of the 38th Conference on Neural Information Processing Systems, 2024"},{"key":"4676_CR205","first-page":"12289","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Luo","year":"2024","unstructured":"Luo Y, Shi M, Khan M O, et al. Fairclip: harnessing fairness in vision-language learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 12289\u201312301"},{"key":"4676_CR206","unstructured":"Luo W, Zhang Q, Lu T, et al. Doxing via the lens: revealing privacy leakage in image geolocation for agentic multi-modal large reasoning model. 2025. ArXiv:2504.19373"},{"key":"4676_CR207","unstructured":"Chen Y, Mendes E, Das S, et al. Can language models be instructed to protect personal information? 2023. ArXiv:2310.02224"},{"key":"4676_CR208","doi-asserted-by":"crossref","unstructured":"Shi Y, Gao Y, Lai Y, et al. Shield: an evaluation benchmark for face spoofing and forgery detection with multimodal large language models. 2024. ArXiv:2402.04178","DOI":"10.1007\/s44267-025-00079-w"},{"key":"4676_CR209","doi-asserted-by":"crossref","unstructured":"Chandna B, Aboujenane M, Naseem U. Extremeaigc: benchmarking LMM vulnerability to AI-generated extremist content. 2025. ArXiv:2503.09964","DOI":"10.18653\/v1\/2025.findings-emnlp.1176"},{"key":"4676_CR210","doi-asserted-by":"crossref","unstructured":"Wang S, Long Z, Fan Z, et al. From LLMs to MLLMs: exploring the landscape of multimodal jailbreaking. 2024. ArXiv:2406.14859","DOI":"10.18653\/v1\/2024.emnlp-main.973"},{"key":"4676_CR211","unstructured":"Guo Y, Jiao F, Nie L, et al. The VLLM safety paradox: dual ease in jailbreak attack and defense. 2024. ArXiv:2411.08410"},{"key":"4676_CR212","unstructured":"Ying Z, Liu A, Liu X, et al. Unveiling the safety of GPT-4o: an empirical study using jailbreak attacks. 2024. ArXiv:2406.06302"},{"key":"4676_CR213","first-page":"386","volume-title":"Proceedings of European Conference on Computer Vision","author":"X Liu","year":"2024","unstructured":"Liu X, Zhu Y, Gu J, et al. Mm-safetybench: a benchmark for safety evaluation of multimodal large language models. In: Proceedings of European Conference on Computer Vision, 2024. 386\u2013403"},{"key":"4676_CR214","doi-asserted-by":"publisher","first-page":"4035","DOI":"10.18653\/v1\/D18-1437","volume-title":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","author":"A Rohrbach","year":"2018","unstructured":"Rohrbach A, Hendricks L A, Burns K, et al. Object hallucination in image captioning. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, 2018. 4035\u20134045"},{"key":"4676_CR215","unstructured":"Downer G, Craven S, Ruck D, et al. Text2vlm: adapting text-only datasets to evaluate alignment training in visual language models. 2025. ArXiv:2507.20704"},{"key":"4676_CR216","unstructured":"Li X, Zhou H, Wang R, et al. Mossbench: is your multimodal language model oversensitive to safe queries? 2024. ArXiv:2406.17806"},{"key":"4676_CR217","doi-asserted-by":"publisher","first-page":"1725","DOI":"10.18653\/v1\/2023.eacl-main.126","volume-title":"Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics","author":"S Janghorbani","year":"2023","unstructured":"Janghorbani S, De Melo G. Multi-modal bias: introducing a framework for stereotypical bias assessment beyond gender and race in vision\u2013language models. In: Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, 2023. 1725\u20131735"},{"key":"4676_CR218","unstructured":"Lu P, Bansal H, Xia T, et al. Mathvista: evaluating mathematical reasoning of foundation models in visual contexts. 2023. ArXiv:2310.02255"},{"key":"4676_CR219","unstructured":"Gupta H, Verma S, Anantheswaran U, et al. Polymath: a challenging multi-modal mathematical reasoning benchmark. 2024. ArXiv:2410.14702"},{"key":"4676_CR220","first-page":"95095","volume":"37","author":"K Wang","year":"2024","unstructured":"Wang K, Pan J, Shi W, et al. Measuring multimodal mathematical reasoning with math-vision dataset. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 95095\u201395169","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR221","unstructured":"He C, Luo R, Bai Y, et al. Olympiadbench: a challenging benchmark for promoting AGI with olympiad-level bilingual multimodal scientific problems. 2024. ArXiv:2402.14008"},{"key":"4676_CR222","unstructured":"Wang Y, Zhang P, Tang J, et al. Polymath: evaluating mathematical reasoning in multilingual contexts. 2025. ArXiv:2504.18428"},{"key":"4676_CR223","first-page":"169","volume-title":"Proceedings of European Conference on Computer Vision","author":"R Zhang","year":"2024","unstructured":"Zhang R, Jiang D, Zhang Y, et al. Mathverse: does your multi-modal LLM truly see the diagrams in visual math problems? In: Proceedings of European Conference on Computer Vision, 2024. 169\u2013186"},{"key":"4676_CR224","unstructured":"Qiao R, Tan Q, Dong G, et al. We-math: does your large multimodal model achieve human-like mathematical reasoning? 2024. ArXiv:2407.01284"},{"key":"4676_CR225","unstructured":"Zhou M, Liang H, Li T, et al. Mathscape: evaluating MLLMs in multimodal math scenarios through a hierarchical benchmark. 2024. ArXiv:2408.07543"},{"key":"4676_CR226","unstructured":"Liu W, Pan Q, Zhang Y, et al. Cmm-math: a Chinese multimodal math dataset to evaluate and enhance the mathematics reasoning of large multimodal models. 2024. ArXiv:2409.02834"},{"key":"4676_CR227","first-page":"19541","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"P Wang","year":"2025","unstructured":"Wang P, Li Z Z, Yin F, et al. Mv-math: evaluating multimodal math reasoning in multi-visual contexts. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 19541\u201319551"},{"key":"4676_CR228","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"A Kembhavi","year":"2016","unstructured":"Kembhavi A, Salvato M, Kolve E, et al. Diagram understanding in geometry questions. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2016"},{"key":"4676_CR229","doi-asserted-by":"crossref","unstructured":"Anand A, Kapuriya J, Singh A, et al. Mm-phyqa: multimodal physics question answering with multi-image cot prompting. 2024. ArXiv:2404.08704","DOI":"10.1007\/978-981-97-2262-4_5"},{"key":"4676_CR230","unstructured":"Wang L, Su E, Liu J, et al. Physunibench: an undergraduate-level physics reasoning benchmark for multimodal models. 2025. ArXiv:2506.17667"},{"key":"4676_CR231","doi-asserted-by":"crossref","unstructured":"Dai S, Yan Y, Su J, et al. Physicsarena: the first multimodal physics reasoning benchmark exploring variable, process, and solution dimensions. 2025. ArXiv:2505.15472","DOI":"10.18653\/v1\/2025.findings-emnlp.937"},{"key":"4676_CR232","unstructured":"Xiang K, Li H, Zhang T J, et al. Seephys: does seeing help thinking? Benchmarking vision-based physics reasoning. 2025. ArXiv:2505.19099"},{"key":"4676_CR233","unstructured":"Zhang X, Dong Y, Wu Y, et al. Physreason: a comprehensive benchmark towards physics-based reasoning. 2025. ArXiv:2502.12054"},{"key":"4676_CR234","first-page":"109","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, Bangkok","author":"Z Liang","year":"2024","unstructured":"Liang Z, Guo K, Liu G, et al. Scemqa: a scientific college entrance level multimodal question answering benchmark. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics, Bangkok, 2024. 109\u2013119"},{"key":"4676_CR235","doi-asserted-by":"crossref","unstructured":"Yu S, Wu P, Liang P P, et al. PACS: a dataset for physical audiovisual commonsense reasoning. 2022. ArXiv:2203.11130","DOI":"10.1007\/978-3-031-19836-6_17"},{"key":"4676_CR236","unstructured":"Jassim S, Holubar M, Richter A, et al. GRASP: a novel benchmark for evaluating language grounding and situated physics understanding in multimodal language models. 2023. ArXiv:2311.09048"},{"key":"4676_CR237","unstructured":"Foss A, Evans C, Mitts S, et al. Causalvqa: a physically grounded causal reasoning benchmark for video models. 2025. ArXiv:2506.09943"},{"key":"4676_CR238","unstructured":"Shabtay N, Polo F M, Doveh S, et al. Livexiv\u2014a multi-modal live benchmark based on arxiv papers content. 2024. ArXiv:2410.10783"},{"key":"4676_CR239","first-page":"4999","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"A Kembhavi","year":"2017","unstructured":"Kembhavi A, Seo M, Schwenk D, et al. Are you smarter than a sixth grader? Textbook question answering for multimodal machine comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), 2017. 4999\u20135007"},{"key":"4676_CR240","doi-asserted-by":"publisher","first-page":"31","DOI":"10.1021\/ci00057a005","volume":"28","author":"D Weininger","year":"1988","unstructured":"Weininger D. SMILES, a chemical language and information system. 1. Introduction to methodology and encoding rules. J Chem Inf Comput Sci, 1988, 28: 31\u201336","journal-title":"J Chem Inf Comput Sci"},{"key":"4676_CR241","doi-asserted-by":"publisher","first-page":"595","DOI":"10.18653\/v1\/2021.emnlp-main.47","volume-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","author":"C Edwards","year":"2021","unstructured":"Edwards C, Zhai C, Ji H. Text2mol: cross-modal molecule retrieval with natural language queries. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, 2021. 595\u2013607"},{"key":"4676_CR242","unstructured":"Zhang D, Liu W, Tan Q, et al. Chemllm: a chemical large language model. 2024. ArXiv:2402.06852"},{"key":"4676_CR243","doi-asserted-by":"publisher","first-page":"045024","DOI":"10.1088\/2632-2153\/aba947","volume":"1","author":"M Krenn","year":"2020","unstructured":"Krenn M, H\u00e4se F, Nigam A, et al. Self-referencing embedded strings (selfies): a 100% robust molecular string representation. Mach Learn Sci Tech, 2020, 1: 045024","journal-title":"Mach Learn Sci Tech"},{"key":"4676_CR244","doi-asserted-by":"publisher","first-page":"23","DOI":"10.1186\/s13321-015-0068-4","volume":"7","author":"S R Heller","year":"2015","unstructured":"Heller S R, McNaught A, Pletnev I, et al. InChI, the IUPAC international chemical identifier. J Cheminform, 2015, 7: 23","journal-title":"J Cheminform"},{"key":"4676_CR245","doi-asserted-by":"publisher","first-page":"108073","DOI":"10.1016\/j.compbiomed.2024.108073","volume":"171","author":"P Liu","year":"2024","unstructured":"Liu P, Ren Y, Tao J, et al. GIT-Mol: a multi-modal large language model for molecular science with graph, image, and text. Comput Biol Med, 2024, 171: 108073","journal-title":"Comput Biol Med"},{"key":"4676_CR246","unstructured":"Cao H, Liu Z, Lu X, et al. Instructmol: multi-modal integration for building a versatile and reliable molecular assistant in drug discovery. 2023. ArXiv:2311.16208"},{"key":"4676_CR247","first-page":"415","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"J Li","year":"2025","unstructured":"Li J, Zhang D, Wang X, et al. Chemvlm: exploring the power of multimodal large language models in chemistry area. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2025. 415\u2013423"},{"key":"4676_CR248","doi-asserted-by":"publisher","first-page":"315","DOI":"10.1038\/s42256-024-00977-6","volume":"7","author":"P Liu","year":"2025","unstructured":"Liu P, Tao J, Ren Z. A quantitative analysis of knowledge-learning preferences in large language models in molecular science. Nat Mach Intell, 2025, 7: 315\u2013327","journal-title":"Nat Mach Intell"},{"key":"4676_CR249","unstructured":"Alampara N, Schilling-Wilhelmi M, R\u00edos-Garc\u00eda M, et al. Probing the limitations of multimodal language models for chemistry and materials research. 2024. ArXiv:2411.16955"},{"key":"4676_CR250","unstructured":"Li S, Liu Z, Luo Y, et al. Towards 3d molecule-text interpretation in language models. 2024. ArXiv:2401.13923"},{"key":"4676_CR251","first-page":"110010","volume":"37","author":"R Bushuiev","year":"2024","unstructured":"Bushuiev R, Bushuiev A, de Jonge N, et al. MassSpecGym: a benchmark for the discovery and identification of molecules. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 110010\u2013110027","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR252","doi-asserted-by":"publisher","first-page":"125780","DOI":"10.52202\/079017-3996","volume":"37","author":"M Alberts","year":"2024","unstructured":"Alberts M, Schilter O, Zipoli F, et al. Unraveling molecular structure: a multimodal spectroscopic dataset for chemistry. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 125780\u2013125808","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR253","first-page":"134721","volume":"37","author":"K Guo","year":"2024","unstructured":"Guo K, Nan B, Zhou Y, et al. Can LLMs solve molecule puzzles? A multimodal benchmark for molecular structure elucidation. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 134721\u2013134746","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR254","unstructured":"Le K, Guo Z, Dong K, et al. Molx: enhancing large language models for molecular learning with a multi-modal extension. 2024. ArXiv:2406.06777"},{"key":"4676_CR255","doi-asserted-by":"publisher","first-page":"2550006","DOI":"10.1142\/S0219720025500064","volume":"23","author":"S Guo","year":"2025","unstructured":"Guo S, Wang L, Jin C, et al. M3-20M: a large-scale multi-modal molecule dataset for AI-driven drug design and discovery. J Bioinform Comput Biol, 2025, 23: 2550006","journal-title":"J Bioinform Comput Biol"},{"key":"4676_CR256","volume-title":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (ACL)","author":"J Luo","year":"2025","unstructured":"Luo J, Kou Z, Yang L, et al. Finmme: benchmark dataset for financial multi-modal reasoning evaluation. In: Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (ACL), 2025"},{"key":"4676_CR257","unstructured":"Xue S, Li X, Zhou F, et al. Famma: a benchmark for financial domain multilingual multimodal question answering. 2024. ArXiv:2410.04526"},{"key":"4676_CR258","unstructured":"Gan Z, Lu Y, Zhang D, et al. Mme-finance: a multimodal finance benchmark for expert-level understanding and reasoning. 2024. ArXiv:2411.03314"},{"key":"4676_CR259","unstructured":"Peng X, Qian L, Wang Y, et al. Multifinben: a multilingual, multimodal, and difficulty-aware benchmark for financial LLM evaluation. 2025. ArXiv:2506.14028"},{"key":"4676_CR260","unstructured":"Zeng L, Lou F, Wang Z, et al. Fingaia: an end-to-end benchmark for evaluating AI agents in finance. 2025. ArXiv:2507.17186"},{"key":"4676_CR261","unstructured":"Li J, Zhu Y, Cheng D, et al. Cfbenchmark-mm: Chinese financial assistant benchmark for multimodal large language model. 2025. ArXiv:2506.13055"},{"key":"4676_CR262","unstructured":"Tang Z, Liu J, Yang Z, et al. Finmmr: make financial numerical reasoning more multimodal, comprehensive, and challenging. 2025. ArXiv:2508.04625"},{"key":"4676_CR263","first-page":"785","volume-title":"Proceedings of Companion Proceedings of the ACM Web Conference","author":"A Rangapur","year":"2025","unstructured":"Rangapur A, Wang H, Jian L, et al. Fin-fact: a benchmark dataset for multimodal financial fact-checking and explanation generation. In: Proceedings of Companion Proceedings of the ACM Web Conference, 2025. 785\u2013788"},{"key":"4676_CR264","volume-title":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (ACL)","author":"S Kim","year":"2025","unstructured":"Kim S, Kim C, Kim T. Fcmr: robust evaluation of financial cross-modal multi-hop reasoning. In: Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (ACL), 2025"},{"key":"4676_CR265","doi-asserted-by":"crossref","unstructured":"Bhatia G, Nagoudi E M B, Cavusoglu H, et al. Fintral: a family of GPT-4 level multimodal financial large language models. 2024. ArXiv:2402.10986","DOI":"10.18653\/v1\/2024.findings-acl.774"},{"key":"4676_CR266","unstructured":"Huang J, Xiao M, Li D, et al. Open-finllms: open multimodal large language models for financial applications. 2024. ArXiv:2408.11878"},{"key":"4676_CR267","doi-asserted-by":"crossref","unstructured":"Wu S, Koo M, Blum L, et al. A comparative study of open-source large language models, GPT-4 and Claude 2: multiple-choice test taking in nephrology. 2023. ArXiv:2308.04709","DOI":"10.1056\/AIdbp2300092"},{"key":"4676_CR268","unstructured":"Liu M, Ding J, Xu J, et al. Medbench: a comprehensive, standardized, and reliable benchmarking system for evaluating Chinese medical large language models. 2024. ArXiv:2407.10990"},{"key":"4676_CR269","unstructured":"Chen H, Fang Z, Singla Y, et al. Benchmarking large language models on answering and explaining challenging medical questions. 2024. ArXiv:2402.18060"},{"key":"4676_CR270","doi-asserted-by":"publisher","first-page":"170","DOI":"10.1038\/s41597-023-02068-4","volume":"10","author":"A Krithara","year":"2023","unstructured":"Krithara A, Nentidis A, Bougiatiotis K, et al. BioASQ-QA: a manually curated corpus for biomedical question answering. Sci Data, 2023, 10: 170","journal-title":"Sci Data"},{"key":"4676_CR271","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics","author":"N Zhang","year":"2022","unstructured":"Zhang N, Chen M, Bi Z, et al. CBLUE: a Chinese biomedical language understanding evaluation benchmark. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics, 2022"},{"key":"4676_CR272","unstructured":"Jin Q, Dhingra B, Liu Z, et al. Pubmedqa: a dataset for biomedical research question answering. 2019. ArXiv:1909.06146"},{"key":"4676_CR273","volume-title":"Proceedings of ICLR","author":"E Trop","year":"2024","unstructured":"Trop E, Schiff Y, Marroquin E M, et al. The genomics long-range benchmark: advancing DNA language models. In: Proceedings of ICLR, 2024"},{"key":"4676_CR274","first-page":"22170","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Hu","year":"2024","unstructured":"Hu Y, Li T, Lu Q, et al. Omnimedvqa: a new large-scale comprehensive evaluation benchmark for medical lvlm. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 22170\u201322183"},{"key":"4676_CR275","first-page":"94327","volume":"37","author":"J Ye","year":"2024","unstructured":"Ye J, Wang G, Li Y, et al. Gmai-mmbench: a comprehensive multimodal evaluation benchmark towards general medical AI. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 94327\u201394427","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR276","unstructured":"Zuo Y, Qu S, Li Y, et al. Medxpertqa: benchmarking expert-level medical reasoning and understanding. 2025. ArXiv:2501.18362"},{"key":"4676_CR277","doi-asserted-by":"publisher","first-page":"180251","DOI":"10.1038\/sdata.2018.251","volume":"5","author":"J J Lau","year":"2018","unstructured":"Lau J J, Gayen S, Ben Abacha A, et al. A dataset of clinically generated visual questions and answers about radiology images. Sci Data, 2018, 5: 180251","journal-title":"Sci Data"},{"key":"4676_CR278","doi-asserted-by":"crossref","unstructured":"He X, Zhang Y, Mou L, et al. Pathvqa: 30000+ questions for medical visual question answering. 2020. ArXiv:2003.10286","DOI":"10.36227\/techrxiv.13127537"},{"key":"4676_CR279","first-page":"1650","volume-title":"Proceedings of 2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI)","author":"B Liu","year":"2021","unstructured":"Liu B, Zhan L M, Xu L, et al. Slake: a semantically-labeled knowledge-enhanced dataset for medical visual question answering. In: Proceedings of 2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI), 2021. 1650\u20131654"},{"key":"4676_CR280","first-page":"36722","volume":"35","author":"Y Ji","year":"2022","unstructured":"Ji Y, Bai H, Ge C, et al. Amos: a large-scale abdominal multi-organ benchmark for versatile medical image segmentation. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 35: 36722\u201336732","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR281","doi-asserted-by":"publisher","first-page":"10147","DOI":"10.1038\/s41467-024-54424-6","volume":"15","author":"Q Zheng","year":"2024","unstructured":"Zheng Q, Zhao W, Wu C, et al. Large-scale long-tailed disease diagnosis on radiology images. Nat Commun, 2024, 15: 10147","journal-title":"Nat Commun"},{"key":"4676_CR282","doi-asserted-by":"publisher","unstructured":"Hou W, Ji Z. Geneturing tests GPT models in genomics. BioRxiv, 2023, doi: https:\/\/doi.org\/10.1101\/2023.03.11.532238","DOI":"10.1101\/2023.03.11.532238"},{"key":"4676_CR283","unstructured":"Arora R K, Wei J, Hicks R S, et al. Healthbench: evaluating large language models towards improved human health. 2025. ArXiv:2505.08775"},{"key":"4676_CR284","volume-title":"Baichuan-Omni-1.5 Technical Report","author":"Y Li","year":"2025","unstructured":"Li Y, Liu J, Zhang T, et al. Baichuan-Omni-1.5 Technical Report. 2025. ArXiv:2501.15368"},{"key":"4676_CR285","doi-asserted-by":"publisher","first-page":"931","DOI":"10.1038\/nmeth.3547","volume":"12","author":"J Zhou","year":"2015","unstructured":"Zhou J, Troyanskaya O G. Predicting effects of noncoding variants with deep learning-based sequence model. Nat Methods, 2015, 12: 931\u2013934","journal-title":"Nat Methods"},{"key":"4676_CR286","doi-asserted-by":"publisher","first-page":"D884","DOI":"10.1093\/nar\/gkaa942","volume":"49","author":"K L Howe","year":"2021","unstructured":"Howe K L, Achuthan P, Allen J, et al. Ensembl 2021. Nucleic Acids Res, 2021, 49: D884\u2013D891","journal-title":"Nucleic Acids Res"},{"key":"4676_CR287","doi-asserted-by":"crossref","unstructured":"Yin M, Qu Y, Liu D, et al. Genome-bench: a scientific reasoning benchmark from real-world expert discussions. 2025. ArXiv:2505.19501","DOI":"10.1101\/2025.06.02.657538"},{"key":"4676_CR288","doi-asserted-by":"publisher","first-page":"2613","DOI":"10.1038\/s41591-024-03097-1","volume":"30","author":"P Hager","year":"2024","unstructured":"Hager P, Jungmann F, Holland R, et al. Evaluation and mitigation of the limitations of large language models in clinical decision-making. Nat Med, 2024, 30: 2613\u20132622","journal-title":"Nat Med"},{"key":"4676_CR289","volume-title":"Medgemma technical report","author":"A Sellergren","year":"2025","unstructured":"Sellergren A, Kazemzadeh S, Jaroensri T, et al. Medgemma technical report. 2025. ArXiv:2507.05201"},{"key":"4676_CR290","unstructured":"Si C, Zhang Y, Li R, et al. Design2code: benchmarking multimodal code generation for automated front-end engineering. 2024. ArXiv:2403.03163"},{"key":"4676_CR291","first-page":"112134","volume":"37","author":"S Yun","year":"2024","unstructured":"Yun S, Thushara R, Bhat M, et al. Web2code: a large-scale webpage-to-code dataset and evaluation framework for multimodal LLMs. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 112134\u2013112157","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR292","doi-asserted-by":"crossref","unstructured":"Wu C, Ge Y, Guo Q, et al. Plot2code: a comprehensive benchmark for evaluating multi-modal large language models in code generation from scientific plots. 2024. ArXiv:2405.07990","DOI":"10.18653\/v1\/2025.findings-naacl.164"},{"key":"4676_CR293","unstructured":"Yang C, Shi C, Liu Y, et al. Chartmimic: evaluating LMM\u2019s cross-modal reasoning capability via chart-to-code generation. 2024. ArXiv:2406.09961"},{"key":"4676_CR294","unstructured":"Wang H, Zhou X, Xu Z, et al. Code-vision: evaluating multimodal LLMs logic understanding and code generation capabilities. 2025. ArXiv:2502.11829"},{"key":"4676_CR295","unstructured":"Yang J, Jimenez C E, Zhang A L, et al. Swe-bench multimodal: do AI systems generalize to visual software domains? 2024. ArXiv:2410.03859"},{"key":"4676_CR296","unstructured":"Zhang L, Zan D, Yang Q, et al. Codev: issue resolving with visual data. 2024. ArXiv:2412.17315"},{"key":"4676_CR297","doi-asserted-by":"crossref","unstructured":"Li K, Tian Y, Hu Q, et al. Mmcode: benchmarking multimodal large language models for code generation with visually rich programming problems. 2024. ArXiv:2404.09486","DOI":"10.18653\/v1\/2024.findings-emnlp.42"},{"key":"4676_CR298","unstructured":"Zhang F, Wu L, Bai H, et al. Humaneval-v: benchmarking high-level visual reasoning with complex diagrams in coding tasks. 2024. ArXiv:2410.12381"},{"key":"4676_CR299","unstructured":"Rodriguez J, Jian X, Panigrahi S S, et al. Bigdocs: an open dataset for training multimodal models on document and code tasks. 2024. ArXiv:2412.04626"},{"key":"4676_CR300","unstructured":"Chai L, Yang J, Liu S, et al. Multilingual multimodal software developer for code generation. 2025. ArXiv:2507.08719"},{"key":"4676_CR301","first-page":"7513","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"E Sachdeva","year":"2024","unstructured":"Sachdeva E, Agarwal N, Chundi S, et al. Rank2tell: a multimodal driving dataset for joint importance ranking and reasoning. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2024. 7513\u20137522"},{"key":"4676_CR302","first-page":"1043","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"S Malla","year":"2023","unstructured":"Malla S, Choi C, Dwivedi I, et al. Drama: joint risk localization and captioning in driving. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2023. 1043\u20131052"},{"key":"4676_CR303","first-page":"4542","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"T Qian","year":"2024","unstructured":"Qian T, Chen J, Zhuo L, et al. Nuscenes-qa: a multi-modal visual question answering benchmark for autonomous driving scenario. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2024. 4542\u20134550"},{"key":"4676_CR304","first-page":"252","volume-title":"Proceedings of European Conference on Computer Vision","author":"A M Marcu","year":"2024","unstructured":"Marcu A M, Chen L, H\u00fcnermann J, et al. Lingoqa: visual question answering for autonomous driving. In: Proceedings of European Conference on Computer Vision, 2024. 252\u2013269"},{"key":"4676_CR305","unstructured":"Chiu H k, Hachiuma R, Wang C Y, et al. V2v-llm: vehicle-to-vehicle cooperative autonomous driving with multi-modal large language models. 2025. ArXiv:2502.09980"},{"key":"4676_CR306","unstructured":"Guo X, Zhang R, Duan Y, et al. Surds: benchmarking spatial understanding and reasoning in driving scenarios with vision language models. 2024. ArXiv:2411.13112"},{"key":"4676_CR307","unstructured":"Wei Z, Qiang C, Jiang B, et al. Ad2-bench: a hierarchical cot benchmark for MLLM in autonomous driving under adverse conditions. 2025. ArXiv:2506.09557"},{"key":"4676_CR308","unstructured":"Hao Y, Li Z, Sun L, et al. Driveaction: a benchmark for exploring human-like driving decisions in VLA models. 2025. ArXiv:2506.05667"},{"key":"4676_CR309","unstructured":"Ishaq A, Lahoud J, More K, et al. Drivelmm-o1: a step-by-step reasoning dataset and large multimodal model for driving scenario understanding. 2025. ArXiv:2503.10621"},{"key":"4676_CR310","unstructured":"Tian X, Gu J, Li B, et al. Drivevlm: the convergence of autonomous driving and large vision-language models. 2024. ArXiv:2402.12289"},{"key":"4676_CR311","unstructured":"Xiao B, Feng C, Huang Z, et al. Robotron-sim: improving real-world driving via simulated hard-case. 2025. ArXiv:2508.04642"},{"key":"4676_CR312","first-page":"5838","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","author":"Y Lu","year":"2025","unstructured":"Lu Y, Yao Y, Tu J, et al. Can lvlms obtain a driver\u2019s license? A benchmark towards reliable AGI for autonomous driving. In: Proceedings of the AAAI Conference on Artificial Intelligence, 2025. 5838\u20135846"},{"key":"4676_CR313","unstructured":"Li Y, Tian M, Lin Z, et al. Fine-grained evaluation of large vision-language models in autonomous driving. 2025. ArXiv:2503.21505"},{"key":"4676_CR314","doi-asserted-by":"publisher","first-page":"111367","DOI":"10.1016\/j.dib.2025.111367","volume":"59","author":"K Rekanar","year":"2025","unstructured":"Rekanar K, Joyce J M, Hayes M, et al. DriVQA: a gaze-based dataset for visual question answering in driving scenarios. Data Brief, 2025, 59: 111367","journal-title":"Data Brief"},{"key":"4676_CR315","volume-title":"Proceedings of ICLR","author":"L Wen","year":"2024","unstructured":"Wen L, Yang X, Fu D, et al. On the road with GPT-4v (ision): explorations of utilizing visual-language model as autonomous driving agent. In: Proceedings of ICLR, 2024"},{"key":"4676_CR316","first-page":"21819","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"X Cao","year":"2024","unstructured":"Cao X, Zhou T, Ma Y, et al. Maplm: a real-world large-scale vision-language benchmark for map and traffic scene understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024. 21819\u201321830"},{"key":"4676_CR317","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1145\/3616855.3635772","volume-title":"Proceedings of the 17th ACM International Conference on Web Search and Data Mining","author":"C Deng","year":"2024","unstructured":"Deng C, Zhang T, He Z, et al. K2: a foundation language model for geoscience knowledge understanding and utilization. In: Proceedings of the 17th ACM International Conference on Web Search and Data Mining, 2024. 161\u2013170"},{"key":"4676_CR318","unstructured":"Manivannan V V, Jafari Y, Eranky S, et al. Climaqa: an automated evaluation framework for climate foundation models. 2024. ArXiv:2410.16701"},{"key":"4676_CR319","doi-asserted-by":"crossref","unstructured":"Webersinke N, Kraus M, Bingler J A, et al. Climatebert: a pretrained language model for climate-related text. 2021. ArXiv:2110.12010","DOI":"10.2139\/ssrn.4229146"},{"key":"4676_CR320","unstructured":"Ma C, Hua Z, Anderson-Frey A, et al. Weatherqa: can multimodal language models reason about severe weather? 2024. ArXiv:2406.11217"},{"key":"4676_CR321","doi-asserted-by":"publisher","first-page":"272","DOI":"10.1016\/j.isprsjprs.2025.03.028","volume":"224","author":"Y Hu","year":"2025","unstructured":"Hu Y, Yuan J, Wen C, et al. RSGPT: a remote sensing vision language model and benchmark. ISPRS J Photogrammetry Remote Sens, 2025, 224: 272\u2013286","journal-title":"ISPRS J Photogrammetry Remote Sens"},{"key":"4676_CR322","unstructured":"Zhao X, Xu W, Liu B, et al. MSEarth: a benchmark for multimodal scientific comprehension of earth science. 2025. ArXiv:2505.20740"},{"key":"4676_CR323","doi-asserted-by":"publisher","first-page":"2183","DOI":"10.1109\/TGRS.2017.2776321","volume":"56","author":"X Lu","year":"2017","unstructured":"Lu X, Wang B, Zheng X, et al. Exploring models and data for remote sensing image caption generation. IEEE Trans Geosci Remote Sens, 2017, 56: 2183\u20132195","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"4676_CR324","first-page":"1","volume":"61","author":"Y Zhan","year":"2023","unstructured":"Zhan Y, Xiong Z, Yuan Y. Rsvg: exploring data and models for visual grounding on remote sensing data. IEEE Trans Geosci Remote Sensing, 2023, 61: 1\u201313","journal-title":"IEEE Trans Geosci Remote Sensing"},{"key":"4676_CR325","first-page":"3229","volume":"37","author":"X Li","year":"2024","unstructured":"Li X, Ding J, Elhoseiny M. Vrsbench: a versatile vision-language benchmark dataset for remote sensing image understanding. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 3229\u20133242","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR326","unstructured":"Luo J, Zhang Y, Yang X, et al. When large vision-language model meets large remote sensing imagery: coarse-to-fine text-guided token pruning. 2025. ArXiv:2503.07588"},{"key":"4676_CR327","first-page":"27831","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K Kuckreja","year":"2024","unstructured":"Kuckreja K, Danish M S, Naseer M, et al. Geochat: grounded large vision-language model for remote sensing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 27831\u201327840"},{"key":"4676_CR328","unstructured":"Xu W, Zhao X, Zhou Y, et al. Earthse: a benchmark evaluating earth scientific exploration capability for large language models. 2025. ArXiv:2505.17139"},{"key":"4676_CR329","unstructured":"An X, Sun J, Gui Z, et al. Choice: benchmarking the remote sensing capabilities of large vision-language models. 2024. ArXiv:2411.18145"},{"key":"4676_CR330","unstructured":"Ma Z, Xiao X, Dong S, et al. Sarchat-bench-2m: a multi-task vision-language benchmark for SAR image interpretation. 2025. ArXiv:2502.08168"},{"key":"4676_CR331","first-page":"440","volume-title":"Proceedings of European Conference on Computer Vision. Springer","author":"D Muhtar","year":"2024","unstructured":"Muhtar D, Li Z, Gu F, et al. Lhrs-bot: empowering remote sensing with VGI-enhanced large multimodal language model. In: Proceedings of European Conference on Computer Vision. Springer, 2024. 440\u2013457"},{"key":"4676_CR332","unstructured":"Bi Z, Zhang N, Xue Y, et al. Oceangpt: a large language model for ocean science tasks. 2023. ArXiv:2310.02031"},{"key":"4676_CR333","unstructured":"Luo J, Pang Z, Zhang Y, et al. Skysensegpt: a fine-grained instruction tuning dataset and model for remote sensing vision-language understanding. 2024. ArXiv:2406.10100"},{"key":"4676_CR334","doi-asserted-by":"crossref","unstructured":"Zhang C, Wang S. Good at captioning, bad at counting: benchmarking GPT-4v on earth observation data. 2024. ArXiv:2401.17600","DOI":"10.1109\/CVPRW63382.2024.00780"},{"key":"4676_CR335","unstructured":"Danish M S, Munir M A, Shah S R A, et al. Geobench-vlm: benchmarking vision-language models for geospatial tasks. 2024. ArXiv:2411.19325"},{"key":"4676_CR336","unstructured":"Mall U, Phoo C, Liu M, et al. Remote sensing vision-language foundation models without annotations via ground remote alignment. 2023. ArXiv:2312.06960"},{"key":"4676_CR337","unstructured":"Wang F, Chen M, He X, et al. Omniearth-bench: towards holistic evaluation of earth\u2019s six spheres and cross-spheres interactions with multimodal observational earth data. 2025. ArXiv:2505.23522"},{"key":"4676_CR338","first-page":"1","volume":"60","author":"Q Cheng","year":"2022","unstructured":"Cheng Q, Huang H, Xu Y, et al. Nwpu-captions dataset and mlca-net for remote sensing image captioning. IEEE Trans Geosci Remote Sensing, 2022, 60: 1\u201319","journal-title":"IEEE Trans Geosci Remote Sensing"},{"key":"4676_CR339","doi-asserted-by":"publisher","first-page":"8555","DOI":"10.1109\/TGRS.2020.2988782","volume":"58","author":"S Lobry","year":"2020","unstructured":"Lobry S, Marcos D, Murray J, et al. RSVQA: visual question answering for remote sensing data. IEEE Trans Geosci Remote Sens, 2020, 58: 8555\u20138566","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"4676_CR340","first-page":"3674","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"P Anderson","year":"2018","unstructured":"Anderson P, Wu Q, Teney D, et al. Vision-and-language navigation: Interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018. 3674\u20133683"},{"key":"4676_CR341","first-page":"9982","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Y Qi","year":"2020","unstructured":"Qi Y, Wu Q, Anderson P, et al. Reverie: remote embodied visual referring expression in real indoor environments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020. 9982\u20139991"},{"key":"4676_CR342","first-page":"18995","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"K Grauman","year":"2022","unstructured":"Grauman K, Westbury A, Byrne E, et al. Ego4d: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022. 18995\u201319012"},{"key":"4676_CR343","first-page":"19119","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"S Datta","year":"2022","unstructured":"Datta S, Dharur S, Cartillier V, et al. Episodic memory question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022. 19119\u201319128"},{"key":"4676_CR344","unstructured":"Ma X, Yong S, Zheng Z, et al. Sqa3d: Situated question answering in 3d scenes. 2022. ArXiv:2210.07474"},{"key":"4676_CR345","first-page":"16488","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"A Majumdar","year":"2024","unstructured":"Majumdar A, Ajay A, Zhang X, et al. Openeqa: embodied question answering in the era of foundation models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 16488\u201316498"},{"key":"4676_CR346","doi-asserted-by":"crossref","unstructured":"Ren A Z, Clark J, Dixit A, et al. Explore until confident: efficient exploration for embodied question answering. 2024. ArXiv:2403.15941","DOI":"10.15607\/RSS.2024.XX.089"},{"key":"4676_CR347","unstructured":"Chen Z, Shi Z, Lu X, et al. Rh20t-p: a primitive-level robotic dataset towards composable generalization agents. 2024. ArXiv:2403.19622"},{"key":"4676_CR348","first-page":"3343","volume":"35","author":"B Jia","year":"2022","unstructured":"Jia B, Lei T, Zhu S C, et al. Egotaskqa: understanding human tasks in egocentric videos. In: Proceedings of Advances in Neural Information Processing Systems, 2022. 35: 3343\u20133360","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR349","first-page":"19757","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"T Wang","year":"2024","unstructured":"Wang T, Mao X, Zhu C, et al. Embodiedscan: a holistic multi-modal 3d perception suite towards embodied AI. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 19757\u201319767"},{"key":"4676_CR350","unstructured":"Cheng Z, Tu Y, Li R, et al. Embodiedeval: evaluate multimodal LLMs as embodied agents. 2025. ArXiv:2501.11858"},{"key":"4676_CR351","unstructured":"Jiang K, Liu Y, Chen W, et al. Beyond the destination: a novel benchmark for exploration-aware embodied question answering. 2025. ArXiv:2503.11117"},{"key":"4676_CR352","unstructured":"Yang R, Chen H, Zhang J, et al. Embodiedbench: comprehensive benchmarking multi-modal large language models for vision-driven embodied agents. 2025. ArXiv:2502.09560"},{"key":"4676_CR353","unstructured":"Zhang S, Xu Z, Liu P, et al. Vlabench: a large-scale benchmark for language-conditioned robotics manipulation with long-horizon reasoning tasks. 2024. ArXiv:2412.18194"},{"key":"4676_CR354","unstructured":"Yue H, Huang S, Liao Y, et al. Ewmbench: evaluating scene, motion, and semantic quality in embodied world models. 2025. ArXiv:2505.09694"},{"key":"4676_CR355","first-page":"720","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","author":"D Damen","year":"2018","unstructured":"Damen D, Doughty H, Farinella G M, et al. Scaling egocentric vision: the epic-kitchens dataset. In: Proceedings of the European Conference on Computer Vision (ECCV), 2018. 720\u2013736"},{"key":"4676_CR356","doi-asserted-by":"publisher","first-page":"7327","DOI":"10.1109\/LRA.2022.3180108","volume":"7","author":"O Mees","year":"2022","unstructured":"Mees O, Hermann L, Rosete-Beas E, et al. CALVIN: a benchmark for language-conditioned policy learning for long-horizon robot manipulation tasks. IEEE Robot Autom Lett, 2022, 7: 7327\u20137334","journal-title":"IEEE Robot Autom Lett"},{"key":"4676_CR357","first-page":"312","volume-title":"Proceedings of European Conference on Computer Vision","author":"A Burns","year":"2022","unstructured":"Burns A, Arsan D, Agrawal S, et al. A dataset for interactive vision-language navigation with unknown command feasibility. In: Proceedings of European Conference on Computer Vision, 2022. 312\u2013328"},{"key":"4676_CR358","first-page":"10740","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"M Shridhar","year":"2020","unstructured":"Shridhar M, Thomason J, Gordon D, et al. Alfred: a benchmark for interpreting grounded instructions for everyday tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020. 10740\u201310749"},{"key":"4676_CR359","first-page":"1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"A Das","year":"2018","unstructured":"Das A, Datta S, Gkioxari G, et al. Embodied question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018. 1\u201310"},{"key":"4676_CR360","volume-title":"Neurips 2025 embodied agent interface challenge","author":"T Q J Frellsen","year":"2025","unstructured":"Frellsen T Q J, Zhang K. Neurips 2025 embodied agent interface challenge. 2025. https:\/\/blog.neurips.cc\/2025\/06\/27\/neurips-2025-competitions-announced\/"},{"key":"4676_CR361","doi-asserted-by":"crossref","unstructured":"Wang Z J, Montoya E, Munechika D, et al. Diffusiondb: a large-scale prompt gallery dataset for text-to-image generative models. 2022. ArXiv:2210.14896","DOI":"10.18653\/v1\/2023.acl-long.51"},{"key":"4676_CR362","first-page":"2096","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"X Wu","year":"2023","unstructured":"Wu X, Sun K, Zhu F, et al. Human preference score: better aligning text-to-image models with human preference. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2023. 2096\u20132105"},{"key":"4676_CR363","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"J Xu","year":"2024","unstructured":"Xu J, Liu X, Wu Y, et al. Imagereward: learning and evaluating human preferences for text-to-image generation. In: Proceedings of Advances in Neural Information Processing Systems, 2024"},{"key":"4676_CR364","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"Y Kirstain","year":"2024","unstructured":"Kirstain Y, Polyak A, Singer U, et al. Pick-a-pic: an open dataset of user preferences for text-to-image generation. In: Proceedings of Advances in Neural Information Processing Systems, 2024"},{"key":"4676_CR365","doi-asserted-by":"publisher","first-page":"440","DOI":"10.1109\/ICMEW59549.2023.00082","volume-title":"Proceedings of 2023 IEEE International Conference on Multimedia and Expo Workshops (ICMEW)","author":"Z Zhang","year":"2023","unstructured":"Zhang Z, Li C, Sun W, et al. A perceptual quality assessment exploration for AIGC images. In: Proceedings of 2023 IEEE International Conference on Multimedia and Expo Workshops (ICMEW), 2023. 440\u2013445"},{"key":"4676_CR366","doi-asserted-by":"publisher","first-page":"6833","DOI":"10.1109\/TCSVT.2023.3319020","volume":"34","author":"C Li","year":"2024","unstructured":"Li C, Zhang Z, Wu H, et al. AGIQA-3K: an open database for AI-generated image quality assessment. IEEE Trans Circuits Syst Video Technol, 2024, 34: 6833\u20136846","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"4676_CR367","first-page":"46","volume-title":"Proceedings of CAAI International Conference on Artificial Intelligence","author":"J Wang","year":"2023","unstructured":"Wang J, Duan H, Liu J, et al. Aigciqa2023: a large-scale image quality assessment database for AI generated images: from the perspectives of quality, authenticity and correspondence. In: Proceedings of CAAI International Conference on Artificial Intelligence, 2023. 46\u201357"},{"key":"4676_CR368","unstructured":"Chen Z, Sun W, Wu H, et al. Exploring the naturalness of AI-generated images. 2024. ArXiv:2312.05476"},{"key":"4676_CR369","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops","author":"C Li","year":"2024","unstructured":"Li C, Kou T, Gao Y, et al. Aigiqa-20k: a large database for AI-generated image quality assessment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, 2024"},{"key":"4676_CR370","doi-asserted-by":"crossref","unstructured":"Yang L, Duan H, Teng L, et al. Aigcoiqa2024: perceptual quality assessment of AI generated omnidirectional images. 2024. ArXiv:2404.01024","DOI":"10.1109\/ICIP51287.2024.10647885"},{"key":"4676_CR371","unstructured":"Li C, Wu X, Wu H, et al. Cmc-bench: towards a new paradigm of visual signal compression. 2024. ArXiv:2406.09356"},{"key":"4676_CR372","unstructured":"Yuan J, Cao X, Li C, et al. Pku-i2iqa: an image-to-image quality assessment database for AI generated images. 2023. ArXiv:2311.15556"},{"key":"4676_CR373","unstructured":"Yarom M, Bitton Y, Changpinyo S, et al. What you see is what you read? Improving text-image alignment evaluation. 2023. ArXiv:2305.10400"},{"key":"4676_CR374","unstructured":"Wang J, Duan H, Zhai G, et al. Quality assessment for AI generated images with instruction tuning. 2025. ArXiv:2405.07346"},{"key":"4676_CR375","doi-asserted-by":"crossref","unstructured":"Zhang Z, Kou T, Wang S, et al. Q-eval-100k: evaluating visual quality and alignment level for text-to-vision content. 2025. ArXiv:2503.02357","DOI":"10.1109\/CVPR52734.2025.00993"},{"key":"4676_CR376","unstructured":"Wu X, Hao Y, Sun K, et al. Human preference score v2: a solid benchmark for evaluating human preferences of text-to-image synthesis. 2023. ArXiv:2306.09341"},{"key":"4676_CR377","unstructured":"Chivileva I, Lynch P, Ward T E, et al. Measuring the quality of text-to-video model outputs: metrics and dataset. 2023. ArXiv:2309.08009"},{"key":"4676_CR378","unstructured":"Liu Y, Cun X, Liu X, et al. Evalcrafter: benchmarking and evaluating large video generation models. 2023. ArXiv:2310.11440"},{"key":"4676_CR379","unstructured":"Liu Y, Li L, Ren S, et al. Fetv: a benchmark for fine-grained evaluation of open-domain text-to-video generation. 2023. ArXiv:2311.01813"},{"key":"4676_CR380","unstructured":"Huang Z, He Y, Yu J, et al. Vbench: comprehensive benchmark suite for video generative models. 2023. ArXiv:2311.17982"},{"key":"4676_CR381","doi-asserted-by":"crossref","unstructured":"Kou T, Liu X, Zhang Z, et al. Subjective-aligned dataset and metric for text-to-video quality assessment. 2024. ArXiv:2403.11956","DOI":"10.1145\/3664647.3680868"},{"key":"4676_CR382","volume-title":"Proceedings of Advances in Neural Information Processing Systems (NeurIPS)","author":"Z Chen","year":"2024","unstructured":"Chen Z, Sun W, Tian Y, et al. Gaia: rethinking action quality assessment for ai-generated videos. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS), 2024"},{"key":"4676_CR383","first-page":"18869","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"J Wang","year":"2025","unstructured":"Wang J, Duan H, Zhai G, et al. Aigv-assessor: benchmarking and evaluating the perceptual quality of text-to-video generation with LMM. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 18869\u201318880"},{"key":"4676_CR384","unstructured":"Wang J, Duan H, Jia Z, et al. Love: benchmarking and evaluating text-to-video generation and video-to-text interpretation. 2025. ArXiv:2505.12098"},{"key":"4676_CR385","unstructured":"Zhang Z, Sun W, Li X, et al. Human-activity AGV quality assessment: a benchmark dataset and an objective evaluation metric. 2024. ArXiv:2411.16619"},{"key":"4676_CR386","unstructured":"Wang J, Wang J, Duan H, et al. Tdve-assessor: benchmarking and evaluating the quality of text-driven video editing with LMMS. 2025. ArXiv:2505.19535"},{"key":"4676_CR387","unstructured":"Cao Y, Min X, Gao Y, et al. Agav-rater: adapting large multimodal model for AI-generated audio-visual quality assessment. 2025. ArXiv:2501.18314"},{"key":"4676_CR388","unstructured":"Chen C, Hu Y, Wang S, et al. Audio large language models can be descriptive speech quality evaluators. 2025. ArXiv:2501.17202"},{"key":"4676_CR389","unstructured":"\u0141ajszczak M, C\u00e1mbara G, Li Y, et al. Base TTS: lessons from building a billion-parameter text-to-speech model on 100k hours of data. 2024. ArXiv:2402.08093"},{"key":"4676_CR390","unstructured":"Wang X, Zhao Z, Ren S, et al. Audio Turing test: benchmarking the human-likeness of large language model-based text-to-speech systems in Chinese. 2025. ArXiv:2505.11200"},{"key":"4676_CR391","unstructured":"Minixhofer C, Klejch O, Bell P. TTSDS2: resources and benchmark for evaluating human-quality text to speech systems. 2025. ArXiv:2506.19441"},{"key":"4676_CR392","unstructured":"Zhang Y, Cui B, Yang Q, et al. Benchmarking and learning multi-dimensional quality evaluator for text-to-3D generation. 2024. ArXiv:2412.11170"},{"key":"4676_CR393","first-page":"1","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Y Zhou","year":"2025","unstructured":"Zhou Y, Zhang Z, Wen F, et al. 3dgcqa: a quality assessment database for 3d ai-generated contents. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2025. 1\u20135"},{"key":"4676_CR394","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2025.3604905","volume-title":"IEEE Trans Multimedia","author":"K Fu","year":"2025","unstructured":"Fu K, Duan H, Zhang Z, et al. Multi-dimensional quality assessment for text-to-3D assets: dataset and model. IEEE Trans Multimedia, 2025, doi: https:\/\/doi.org\/10.1109\/tmm.2025.3604905"},{"key":"4676_CR395","volume-title":"Proceedings of the IEEE International Conference on Multimedia and Expo","author":"K Fu","year":"2025","unstructured":"Fu K, Duan H, Zhang Z, et al. Si23dcqa: perceptual quality assessment of single image-to-3d content. In: Proceedings of the IEEE International Conference on Multimedia and Expo, 2025"},{"key":"4676_CR396","doi-asserted-by":"crossref","unstructured":"Xing Y, Wang J, Niu P, et al. 3dgs-ieval-15k: a large-scale image quality evaluation database for 3d Gaussian-splatting. 2025. ArXiv:2506.14642","DOI":"10.1145\/3746027.3758206"},{"key":"4676_CR397","first-page":"366","volume-title":"Proceedings of European Conference on Computer Vision","author":"Z Lin","year":"2024","unstructured":"Lin Z, Pathak D, Li B, et al. Evaluating text-to-visual generation with image-to-text generation. In: Proceedings of European Conference on Computer Vision, 2024. 366\u2013384"},{"key":"4676_CR398","first-page":"29","volume-title":"Proceedings of Advances in Neural Information Processing Systems","author":"T Salimans","year":"2016","unstructured":"Salimans T, Goodfellow I, Zaremba W, et al. Improved techniques for training GANs. In: Proceedings of Advances in Neural Information Processing Systems, 2016. 29"},{"key":"4676_CR399","unstructured":"Unterthiner T, van Steenkiste S, Kurach K, et al. Towards accurate generative models of video: a new metric & challenges. 2018. ArXiv:1812.01717"},{"key":"4676_CR400","unstructured":"Liu X, Min X, Zhai G, et al. Ntire 2024 quality assessment of AI-generated content challenge. 2024. ArXiv:2404.16687"},{"key":"4676_CR401","unstructured":"Wu H, Zhang Z, Zhang E, et al. Q-instruct: improving low-level visual abilities for multi-modality foundation models. 2023. ArXiv:2311.06783"},{"key":"4676_CR402","unstructured":"Wu H, Zhang Z, Zhang W, et al. Q-align: teaching LMMS for visual scoring via discrete text-defined levels. 2023. ArXiv:2312.17090"},{"key":"4676_CR403","unstructured":"Zhang Z, Wu H, Ji Z, et al. Q-boost: on visual quality assessment ability of low-level multi-modality foundation models. 2023. ArXiv:2312.15300"},{"key":"4676_CR404","first-page":"259","volume-title":"Proceedings of European Conference on Computer Vision","author":"Z You","year":"2024","unstructured":"You Z, Li Z, Gu J, et al. Depicting beyond scores: advancing image quality assessment through multi-modal language models. In: Proceedings of European Conference on Computer Vision, 2024. 259\u2013276"},{"key":"4676_CR405","unstructured":"Cui C, Chen K, Wei Z, et al. M3-agiqa: multimodal, multi-round, multi-aspect AI-generated image quality assessment. 2025. ArXiv:2502.15167"},{"key":"4676_CR406","doi-asserted-by":"crossref","unstructured":"Li C, Wu H, Zhang Z, et al. Q-refine: a perceptual quality refiner for AI-generated image. 2024. ArXiv:2401.01117","DOI":"10.1109\/ICME57554.2024.10687390"},{"key":"4676_CR407","doi-asserted-by":"crossref","unstructured":"Wang P, Sun W, Zhang Z, et al. Large multi-modality model assisted AI-generated image quality assessment. 2024. ArXiv:2404.17762","DOI":"10.1145\/3664647.3681471"},{"key":"4676_CR408","first-page":"6692","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Z Yu","year":"2024","unstructured":"Yu Z, Guan F, Lu Y, et al. Sf-iqa: quality and similarity integration for AI generated image quality assessment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 6692\u20136701"},{"key":"4676_CR409","doi-asserted-by":"crossref","unstructured":"Li Q, Yan Q, Huang H, et al. Text-visual semantic constrained AI-generated image quality assessment. 2025. ArXiv:2507.10432","DOI":"10.1145\/3746027.3755471"},{"key":"4676_CR410","unstructured":"Xia J, He L, Gao F, et al. AI-generated image quality assessment based on task-specific prompt and multi-granularity similarity. 2024. ArXiv:2411.16087"},{"key":"4676_CR411","first-page":"6395","volume-title":"Proceedings of 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)","author":"J Yang","year":"2024","unstructured":"Yang J, Fu J, Zhang W, et al. Moe-agiqa: mixture-of-experts boosted visual perception-driven and semantic-aware quality assessment for AI-generated images. In: Proceedings of 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW), 2024. 6395\u20136404"},{"key":"4676_CR412","doi-asserted-by":"crossref","unstructured":"Zhou T, Tan S, Zhou W, et al. Adaptive mixed-scale feature fusion network for blind AI-generated image quality assessment. 2024. ArXiv:2404.15163","DOI":"10.1109\/TBC.2024.3391060"},{"key":"4676_CR413","unstructured":"Yuan J, Cao X, Cao L, et al. Pscr: patches sampling-based contrastive regression for AIGC image quality assessment. 2023. ArXiv:2312.05897"},{"key":"4676_CR414","unstructured":"Yuan J, Cao X, Che J, et al. Tier: text-image encoder-based regression for AIGC image quality assessment. 2024. ArXiv:2401.03854"},{"key":"4676_CR415","first-page":"6432","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"F Peng","year":"2024","unstructured":"Peng F, Fu H, Ming A, et al. AIGC image quality assessment via image-prompt correspondence. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 6432\u20136441"},{"key":"4676_CR416","unstructured":"Zhao X, Zhang P, Tang K, et al. Envisioning beyond the pixels: benchmarking reasoning-informed visual editing. 2025. ArXiv:2504.02826"},{"key":"4676_CR417","unstructured":"Fang R, Duan C, Wang K, et al. GoT: unleashing reasoning capability of multimodal large language model for visual generation and editing. 2025. ArXiv:2503.10639"},{"key":"4676_CR418","first-page":"8362","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Y Huang","year":"2024","unstructured":"Huang Y, Xie L, Wang X, et al. Smartedit: exploring complex instruction-based image editing with multimodal large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024. 8362\u20138371"},{"key":"4676_CR419","unstructured":"Niu Y, Ning M, Zheng M, et al. WISE: a world knowledge-informed semantic evaluation for text-to-image generation. 2025. ArXiv:2503.07265"},{"key":"4676_CR420","unstructured":"Wu Y, Li Z, Hu X, et al. KRIS-Bench: benchmarking next-level intelligent image editing models. 2025. ArXiv:2505.16707"},{"key":"4676_CR421","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Hyderabad","author":"M Kang","year":"2025","unstructured":"Kang M, Zhang X, Wei F, et al. Enhancing image editing with chain-of-thought reasoning and multimodal large language models. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Hyderabad, 2025"},{"key":"4676_CR422","doi-asserted-by":"publisher","first-page":"12873","DOI":"10.1109\/TCSVT.2024.3434999","volume":"34","author":"H Zhu","year":"2024","unstructured":"Zhu H, Sui X, Chen B, et al. 2AFC prompting of large multimodal models for image quality assessment. IEEE Trans Circuits Syst Video Technol, 2024, 34: 12873\u201312878","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"4676_CR423","doi-asserted-by":"publisher","first-page":"6403","DOI":"10.1109\/TCSVT.2024.3367904","volume":"34","author":"H Zhu","year":"2024","unstructured":"Zhu H, Chen B, Zhu L, et al. Video quality assessment for spatio-temporal resolution adaptive coding. IEEE Trans Circuits Syst Video Technol, 2024, 34: 6403\u20136415","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"4676_CR424","unstructured":"Zhu H, Chen B, Zhu L, et al. Deepdc: deep distance correlation as a perceptual image quality evaluator. 2022. ArXiv:2211.04927"},{"key":"4676_CR425","doi-asserted-by":"publisher","first-page":"11083","DOI":"10.1109\/TCSVT.2025.3571788","volume":"35","author":"Q Ge","year":"2025","unstructured":"Ge Q, Sun W, Zhang Y, et al. LMM-VQA: advancing video quality assessment with large multimodal models. IEEE Trans Circuits Syst Video Technol, 2025, 35: 11083\u201311096","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"4676_CR426","first-page":"3206","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"H Duan","year":"2025","unstructured":"Duan H, Hu Q, Wang J, et al. Finevq: fine-grained user generated content video quality assessment. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 3206\u20133217"},{"key":"4676_CR427","unstructured":"Jia Z, Zhang Z, Qian J, et al. Vqa2: visual question answering for video quality assessment. 2024. ArXiv:2411.03795"},{"key":"4676_CR428","unstructured":"Jia Z, Zhang Z, Zhang Z, et al. Scaling-up perceptual video quality assessment. 2025. ArXiv:2505.22543"},{"key":"4676_CR429","unstructured":"Cao L, Sun W, Zhang K, et al. Breaking annotation barriers: Generalized video quality assessment via ranking-based self-supervision. 2025. ArXiv:2505.03631"},{"key":"4676_CR430","first-page":"32611","volume":"37","author":"H Zhu","year":"2024","unstructured":"Zhu H, Wu H, Li Y, et al. Adaptive image quality assessment via teaching large multimodal model to compare. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 32611\u201332629","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR431","unstructured":"Zhang X, Li W, Zhao S, et al. Vq-insight: teaching VLMs for AI-generated video quality understanding via progressive visual reinforcement learning. 2025. ArXiv:2506.18564"},{"key":"4676_CR432","unstructured":"Zhou Y, Cao J, Zhang Z, et al. Who is a better talker: subjective and objective quality assessment for AI-generated talking heads. 2025. ArXiv:2507.23343"},{"key":"4676_CR433","doi-asserted-by":"crossref","unstructured":"Zhou Y, Zhang Z, Sun W, et al. Thqa: a perceptual quality assessment database for talking heads. 2024. ArXiv:2404.09003","DOI":"10.1109\/ICIP51287.2024.10647507"},{"key":"4676_CR434","doi-asserted-by":"publisher","first-page":"10047","DOI":"10.1109\/TCSVT.2025.3572000","volume":"35","author":"Y Zhou","year":"2025","unstructured":"Zhou Y, Zhang Z, Jia J, et al. Who is a better imitator: subjective and objective quality assessment of animated humans. IEEE Trans Circuits Syst Video Technol, 2025, 35: 10047\u201310058","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"4676_CR435","doi-asserted-by":"publisher","first-page":"104321","DOI":"10.1016\/j.ipm.2025.104321","volume":"63","author":"Y Zhou","year":"2026","unstructured":"Zhou Y, Zhang Z, Wu S, et al. MI3S: a multimodal large language model assisted quality assessment framework for AI-generated talking heads. Inf Processing Manage, 2026, 63: 104321","journal-title":"Inf Processing Manage"},{"key":"4676_CR436","unstructured":"Zhou Y, Chen Y, Bi K, et al. An implementation of multimodal fusion system for intelligent digital human generation. 2023. ArXiv:2310.20251"},{"key":"4676_CR437","doi-asserted-by":"crossref","unstructured":"Lo C C, Fu S W, Huang W C, et al. Mosnet: deep learning based objective assessment for voice conversion. 2019. ArXiv:1904.08352","DOI":"10.21437\/Interspeech.2019-2003"},{"key":"4676_CR438","first-page":"1","volume-title":"Proceedings of IEEE International Conference on Multimedia and Expo","author":"R E Zezario","year":"2024","unstructured":"Zezario R E, Chen Y W, Fu S W, et al. A study on incorporating whisper for robust speech assessment. In: Proceedings of IEEE International Conference on Multimedia and Expo, 2024. 1\u20136"},{"key":"4676_CR439","first-page":"5386","volume-title":"Proceedings of Interspeech","author":"Z Li","year":"2023","unstructured":"Li Z, Li W. MOSLight: a lightweight data-efficient system for non-intrusive speech quality assessment. In: Proceedings of Interspeech, 2023. 5386\u20135390"},{"key":"4676_CR440","first-page":"391","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing","author":"Y Leng","year":"2021","unstructured":"Leng Y, Tan X, Zhao S, et al. MBNet: MOS prediction for synthesized speech with mean-bias network. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, 2021. 391\u2013395"},{"key":"4676_CR441","first-page":"526","volume-title":"Proceedings of Interspeech","author":"X Liang","year":"2023","unstructured":"Liang X, Cumlin F, Sch\u00fcldt C, et al. DeePMOS: deep posterior mean-opinion-score of speech. In: Proceedings of Interspeech, 2023. 526\u2013530"},{"key":"4676_CR442","first-page":"896","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing","author":"W C Huang","year":"2022","unstructured":"Huang W C, Cooper E, Yamagishi J, et al. LDNet: unified listener dependent modeling in MOS prediction for synthetic speech. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, 2022. 896\u2013900"},{"key":"4676_CR443","doi-asserted-by":"publisher","first-page":"1493","DOI":"10.1109\/TASLPRO.2025.3552925","volume":"33","author":"Q Liang","year":"2025","unstructured":"Liang Q, Shen Y, Chen T, et al. ADTMOS\u2014synthesized speech quality assessment based on audio distortion tokens. IEEE Trans Audio Speech Lang Process, 2025, 33: 1493\u20131507","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"4676_CR444","doi-asserted-by":"crossref","unstructured":"Wang H, Zhao S, Zhou J, et al. Uncertainty-aware mean opinion score prediction. 2024. ArXiv:2408.12829","DOI":"10.21437\/Interspeech.2024-937"},{"key":"4676_CR445","unstructured":"Tjandra A, Wu Y C, Guo B, et al. Meta audiobox aesthetics: Unified automatic quality assessment for speech, music, and sound. 2025. ArXiv:2502.05139"},{"key":"4676_CR446","unstructured":"Ren W, Lin Y C, Huang W C, et al. HighRateMOS: sampling-rate aware modeling for speech quality assessment. 2025. ArXiv:2506.21951"},{"key":"4676_CR447","doi-asserted-by":"crossref","unstructured":"Chiang C H, Wang X, Lin C C, et al. Audio-aware large language models as judges for speaking styles. 2025. ArXiv:2506.05984","DOI":"10.18653\/v1\/2025.findings-emnlp.25"},{"key":"4676_CR448","volume-title":"Proceedings of International Conference on Learning Representations (ICLR)","author":"C Tang","year":"2024","unstructured":"Tang C, Yu W, Sun G, et al. Salmonn: towards generic hearing abilities for large language models. In: Proceedings of International Conference on Learning Representations (ICLR), 2024"},{"key":"4676_CR449","unstructured":"Chu Y, Xu J, Zhou X, et al. Qwen-audio: advancing universal audio understanding via unified large-scale audio-language models. 2023. ArXiv:2311.07919"},{"key":"4676_CR450","unstructured":"Chu Y, Xu J, Yang Q, et al. Qwen2-audio technical report. 2024. ArXiv:2407.10759"},{"key":"4676_CR451","first-page":"1","volume-title":"Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing","author":"S Wang","year":"2025","unstructured":"Wang S, Yu W, Yang Y, et al. Enabling auditory large language models for automatic speech quality evaluation. In: Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, 2025. 1\u20135"},{"key":"4676_CR452","unstructured":"Wang S, Yu W, Chen X, et al. Qualispeech: a speech quality assessment dataset with natural language reasoning and descriptions. 2025. ArXiv:2503.20290"},{"key":"4676_CR453","unstructured":"Wang S, Sz\u00e9kely \u00c9. Evaluating text-to-speech synthesis from a large discrete token-based speech language model. 2024. ArXiv:2405.09768"},{"key":"4676_CR454","unstructured":"Manku R R, Tang Y, Shi X, et al. EmergentTTS-Eval: evaluating TTS models on complex prosodic, expressiveness, and linguistic challenges using model-as-a-judge. 2025. ArXiv:2505.23009"},{"key":"4676_CR455","unstructured":"Huang K, Tu Q, Fan L, et al. InstructTTSEval: benchmarking complex natural-language instruction following in text-to-speech systems. 2025. ArXiv:2506.16381"},{"key":"4676_CR456","unstructured":"Huang W C, Cooper E, Toda T. Mos-bench: benchmarking generalization abilities of subjective speech quality assessment models. 2024. ArXiv:2411.03715"},{"key":"4676_CR457","doi-asserted-by":"publisher","first-page":"7618","DOI":"10.1109\/TCSVT.2022.3186894","volume":"32","author":"Z Zhang","year":"2022","unstructured":"Zhang Z, Sun W, Min X, et al. No-reference quality assessment for 3D colored point cloud and mesh models. IEEE Trans Circuits Syst Video Technol, 2022, 32: 7618\u20137631","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"4676_CR458","doi-asserted-by":"crossref","unstructured":"Zhang Z, Sun W, Min X, et al. Mm-pcqa: multi-modal learning for no-reference point cloud quality assessment. 2022. ArXiv:2209.00244","DOI":"10.24963\/ijcai.2023\/195"},{"key":"4676_CR459","unstructured":"Su S, Cai X, Gao L, et al. Gt23d-bench: a comprehensive general text-to-3d generation benchmark. 2024. ArXiv:2412.09997"},{"key":"4676_CR460","doi-asserted-by":"publisher","first-page":"2129","DOI":"10.1109\/TVCG.2024.3372037","volume":"30","author":"Q Qu","year":"2024","unstructured":"Qu Q, Liang H, Chen X, et al. NeRF-NQA: no-reference quality assessment for scenes generated by NeRF and neural view synthesis methods. IEEE Trans Visual Comput Graphics, 2024, 30: 2129\u20132139","journal-title":"IEEE Trans Visual Comput Graphics"},{"key":"4676_CR461","first-page":"1","volume-title":"Proceedings of 2024 IEEE International Conference on Visual Communications and Image Processing (VCIP)","author":"Y Xing","year":"2024","unstructured":"Xing Y, Yang Q, Yang K, et al. Explicit-nerf-qa: a quality assessment database for explicit nerf model compression. In: Proceedings of 2024 IEEE International Conference on Visual Communications and Image Processing (VCIP), 2024. 1\u20135"},{"key":"4676_CR462","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1109\/QoMEX58391.2023.10178625","volume-title":"Proceedings of 2023 15th International Conference on Quality of Multimedia Experience (QoMEX)","author":"P Martin","year":"2023","unstructured":"Martin P, Rodrigues A, Ascenso J, et al. Nerf-qa: neural radiance fields quality assessment database. In: Proceedings of 2023 15th International Conference on Quality of Multimedia Experience (QoMEX), 2023. 107\u2013110"},{"key":"4676_CR463","unstructured":"Martin P, Rodrigues A, Ascenso J, et al. Nerf view synthesis: subjective quality assessment and objective metrics evaluation. 2024. ArXiv:2405.20078"},{"key":"4676_CR464","doi-asserted-by":"crossref","unstructured":"Martin P, Rodrigues A, Ascenso J, et al. Gs-qa: comprehensive quality assessment benchmark for Gaussian splatting view synthesis. 2025. ArXiv:2502.13196","DOI":"10.1109\/QoMEX65720.2025.11219925"},{"key":"4676_CR465","first-page":"22227","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"T Wu","year":"2024","unstructured":"Wu T, Yang G, Li Z, et al. Gpt-4v (ision) is a human-aligned evaluator for text-to-3D generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024. 22227\u201322238"},{"key":"4676_CR466","first-page":"13326","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"S Duggal","year":"2025","unstructured":"Duggal S, Hu Y, Michel O, et al. Eval3d: Interpretable and fine-grained evaluation for 3d generation. In: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025. 13326\u201313336"},{"key":"4676_CR467","unstructured":"Zhang Y, Zhang M, Wu T, et al. 3dgen-bench: comprehensive benchmark suite for 3d generative models. 2025. ArXiv:2503.21745"},{"key":"4676_CR468","first-page":"1234","volume-title":"Proceedings of the ACM International Conference on Multimedia (ACM MM)","author":"Z Zhang","year":"2024","unstructured":"Zhang Z, Wu H, Zhou Y, et al. LMM-PCQA: assisting point cloud quality assessment with large multimodal models. In: Proceedings of the ACM International Conference on Multimedia (ACM MM), 2024. 1234\u20131243"},{"key":"4676_CR469","doi-asserted-by":"publisher","first-page":"11198","DOI":"10.1145\/3664647.3685520","volume-title":"Proceedings of the 32nd ACM international conference on multimedia","author":"H Duan","year":"2024","unstructured":"Duan H, Yang J, Qiao Y, et al. Vlmevalkit: an open-source toolkit for evaluating large multi-modality models. In: Proceedings of the 32nd ACM international conference on multimedia, 2024. 11198\u201311201"},{"key":"4676_CR470","first-page":"79889","volume":"37","author":"D Jiang","year":"2024","unstructured":"Jiang D, Ku M, Li T, et al. Genai Arena: an open evaluation platform for generative models. In: Proceedings of Advances in Neural Information Processing Systems, 2024. 37: 79889\u201379908","journal-title":"Proceedings of Advances in Neural Information Processing Systems"},{"key":"4676_CR471","doi-asserted-by":"publisher","first-page":"140","DOI":"10.1111\/nyas.15007","volume":"1525","author":"R Bommasani","year":"2023","unstructured":"Bommasani R, Liang P, Lee T. Holistic evaluation of language models. Ann New York Acad Sci, 2023, 1525: 140\u2013146","journal-title":"Ann New York Acad Sci"},{"key":"4676_CR472","unstructured":"White C, Dooley S, Roberts M, et al. Livebench: a challenging, contamination-limited LLM benchmark. 2025. ArXiv:2406.19314"},{"key":"4676_CR473","volume-title":"AI performance on a set of expert-level mathematics problems","author":"Epoch-AI","year":"2025","unstructured":"Epoch-AI. AI performance on a set of expert-level mathematics problems. https:\/\/epoch.ai\/data\/ai-benchmarking-dashboard, 2025"},{"key":"4676_CR474","volume-title":"Large language model leaderboard","author":"AGI-Eval","year":"2025","unstructured":"AGI-Eval. Large language model leaderboard. 2025. https:\/\/agi-eval.cn\/mvp\/listSummaryIndex"},{"key":"4676_CR475","volume-title":"Really reliable live evaluation for LLM","author":"RELM","year":"2025","unstructured":"RELM. Really reliable live evaluation for LLM. 2025. https:\/\/nonelinear.com\/static\/benchmarking.html"},{"key":"4676_CR476","unstructured":"Xu L, Li A, Zhu L, et al. Superclue: a comprehensive Chinese large language model benchmark. 2023. ArXiv:2307.15020"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-025-4676-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-025-4676-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-025-4676-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,20]],"date-time":"2025-11-20T15:10:55Z","timestamp":1763651455000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-025-4676-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,18]]},"references-count":476,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2025,12]]}},"alternative-id":["4676"],"URL":"https:\/\/doi.org\/10.1007\/s11432-025-4676-4","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,18]]},"assertion":[{"value":"27 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 September 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 November 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 November 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"221301"}}