{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T05:25:09Z","timestamp":1768281909131,"version":"3.49.0"},"reference-count":191,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/access.2025.3649182","type":"journal-article","created":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T18:39:29Z","timestamp":1767033569000},"page":"2690-2725","source":"Crossref","is-referenced-by-count":0,"title":["The Artificial Intelligence Cognitive Examination: A Survey on the Evolution of Multimodal Evaluation From Recognition to Reasoning"],"prefix":"10.1109","volume":"14","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8987-5935","authenticated-orcid":false,"given":"Mayank","family":"Ravishankara","sequence":"first","affiliation":[{"name":"Independent Researcher, San Francisco, CA, USA"}]},{"given":"Varindra V.","family":"Persad Maharaj","sequence":"additional","affiliation":[{"name":"Independent Researcher, Sunnyvale, CA, USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Holistic evaluation of language models","author":"Liang","year":"2022","journal-title":"arXiv:2211.09110"},{"key":"ref2","article-title":"A survey on benchmarks of multimodal large language models","author":"Li","year":"2024","journal-title":"arXiv:2408.08632"},{"key":"ref3","volume-title":"The 2025 AI Index Report","year":"2025"},{"key":"ref4","first-page":"1","article-title":"Benchmarking neural network robustness to common corruptions and perturbations","volume-title":"Proc. ICLR","author":"Hendrycks"},{"key":"ref5","article-title":"GPQA: A graduate-level Google-proof Q&A benchmark","author":"Rein","year":"2023","journal-title":"arXiv:2311.12022"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.149"},{"key":"ref7","article-title":"Measuring massive multitask language understanding","author":"Hendrycks","year":"2020","journal-title":"arXiv:2009.03300"},{"key":"ref8","first-page":"1","article-title":"Position: AI evaluation should learn from how we test humans","volume-title":"Proc. 42ndInt. Conf. Mach. Learn. Position Paper Track","author":"Zhuang"},{"key":"ref9","article-title":"GPT-4o technical report","volume-title":"arXiv:2410.21276","author":"Hurst","year":"2024"},{"key":"ref10","volume-title":"Gemini 1.5 Pro: Technical Report","year":"2024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.1285"},{"key":"ref12","article-title":"Self-consistency improves chain of thought reasoning in language models","author":"Wang","year":"2022","journal-title":"arXiv:2203.11171"},{"key":"ref13","first-page":"4110","article-title":"Dynabench: Rethinking benchmarking in NLP","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics, Hum. Lang. 
{"key":"ref14","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","author":"Zheng","year":"2023","journal-title":"arXiv:2306.05685"},{"key":"ref15","doi-asserted-by":"crossref","DOI":"10.6028\/NIST.AI.100-1","volume-title":"Artificial Intelligence Risk Management Framework (AI RMF 1.0)","author":"Tabassi","year":"2023"},{"key":"ref16","volume-title":"OECD Principles on Artificial Intelligence","year":"2019"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"ref22","article-title":"Attribution regularization for multimodal paradigms","author":"Yerramilli","year":"2024","journal-title":"arXiv:2404.02359"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref27","article-title":"A corpus for reasoning about natural language grounded in photographs","author":"Suhr","year":"2018","journal-title":"arXiv:1811.00491"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00522"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-2034"},{"key":"ref32","article-title":"MathVista: Evaluating mathematical reasoning of foundation models in visual contexts","author":"Lu","year":"2023","journal-title":"arXiv:2310.02255"},{"key":"ref33","article-title":"MM-Vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2023","journal-title":"arXiv:2308.02490"},{"key":"ref34","volume-title":"HallusionBench Project Page","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02245"},{"key":"ref36","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"33","author":"Brown"},{"key":"ref37","first-page":"2633","article-title":"Extracting training data from large language models","volume-title":"Proc. USENIX Secur.","author":"Carlini"},{"key":"ref38","article-title":"Investigating data contamination in large language models","author":"Deng","year":"2023","journal-title":"arXiv:2312.09975"},{"key":"ref39","article-title":"RealTime QA: What\u2019s the answer right now?","author":"Kasai","year":"2022","journal-title":"arXiv:2207.13332"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.813"},{"key":"ref41","article-title":"Benchmark data contamination of large language models: A survey","author":"Xu","year":"2024","journal-title":"arXiv:2406.04244"},{"key":"ref42","first-page":"251","article-title":"Habitat 2.0: Training home assistants to rearrange their habitat","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"34","author":"Szot"},
{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00886"},{"key":"ref44","first-page":"477","article-title":"BEHAVIOR: Benchmark for everyday household activities in virtual, interactive, and ecological environments","volume-title":"Proc. Conf. Robot Learn. (CoRL)","author":"Srivastava"},{"key":"ref45","first-page":"84","article-title":"ImageNet classification with deep convolutional neural networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"60","author":"Krizhevsky"},{"key":"ref46","first-page":"125","article-title":"Adversarial examples are not bugs, they are features","volume-title":"Proc. NeurIPS","volume":"32","author":"Ilyas"},{"key":"ref47","first-page":"1","article-title":"ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness","volume-title":"Proc. ICLR","author":"Geirhos"},{"key":"ref48","first-page":"9625","article-title":"From ImageNet to image classification: Contextualizing progress on benchmarks","volume-title":"Proc. 37th Int. Conf. Mach. Learn.","author":"Tsipras"},{"key":"ref49","volume-title":"From ImageNet to Image Classification","year":"2025"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0733-5"},{"key":"ref51","volume-title":"COCO\u2013Common Objects in Context","year":"2025"},{"key":"ref52","first-page":"25050","article-title":"Does progress on ImageNet transfer to real-world datasets?","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Fang"},{"key":"ref53","article-title":"AdVQA: Adversarial VQA benchmark via model-in-the-loop data collection","author":"Sheng","year":"2021"},{"key":"ref54","article-title":"The clever Hans mirage: A comprehensive survey on spurious correlations in machine learning","author":"Ye","year":"2024","journal-title":"arXiv:2402.12715"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.jacr.2023.06.025"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00380"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00331"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20074-8_9"},{"key":"ref60","first-page":"1","article-title":"GQA-OOD: A benchmark for evaluating generalization of visual question answering models","volume-title":"Proc. Brit. Mach. Vis. Conf.","author":"Kervadec"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"ref62","first-page":"1564","article-title":"Bilinear attention networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Kim"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref64","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Lu"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d16-1204"},{"key":"ref66","first-page":"4069","article-title":"Don\u2019t take the easy way out: Ensemble based methods for debiasing in visual question answering","volume-title":"Proc. 57th Annu. Meeting Assoc. Comput. Linguistics (ACL)","author":"Clark"},{"key":"ref67","first-page":"1","article-title":"HINT: Human importance-aware network tuning","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit. (CVPR)","author":"Selvaraju"},
(CVPR)","author":"Selvaraju"},{"key":"ref68","first-page":"1","article-title":"Don\u2019t Take the easy way out: Bias and invariance in natural language inference","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process. (EMNLP)","author":"Clark"},{"key":"ref69","first-page":"839","article-title":"RUBi: Reducing unimodal biases for visual question answering","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Cadene"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.271"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.12"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.540"},{"key":"ref76","first-page":"4967","article-title":"A simple neural network module for relational reasoning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Santoro"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.93"},{"key":"ref78","first-page":"1","article-title":"Neural-symbolic VQA: Disentangling reasoning from vision and language","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yi"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"ref80","first-page":"1","article-title":"Compositional attention networks for machine reasoning","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Hudson"},{"key":"ref81","article-title":"NLVR2 visual bias analysis","author":"Suhr","year":"2019","journal-title":"arXiv:1909.10411"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"ref85","first-page":"552","article-title":"The Winograd schema challenge","volume-title":"Proc. 13th Int. Conf. Princ. Knowl. Represent. Reasoning","author":"Levesque"},{"key":"ref86","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Radford"},{"key":"ref87","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Jia"},{"key":"ref88","first-page":"1","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Alayrac"},{"key":"ref89","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01389"},{"key":"ref92","first-page":"1","article-title":"MMBERT: Multimodal pretraining for vision-and-language understanding","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process.","author":"Khare"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013027"},{"key":"ref94","first-page":"2729","article-title":"Grounded commonsense inference","volume-title":"Proc. 2019 Conf. North Amer. Chapter Assoc. Comput. 
{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d18-1009"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1454"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1017\/S0140525X16001837"},{"key":"ref98","first-page":"1","article-title":"Benchmarking multimodal reasoning and representation learning for real-world video understanding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gokhale"},{"key":"ref99","first-page":"33343","article-title":"Learning concept credible models for mitigating shortcuts","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Wang"},{"key":"ref100","article-title":"Efficient unsupervised shortcut learning detection and mitigation in transformers","author":"Kuhn","year":"2025","journal-title":"arXiv:2501.00942"},{"key":"ref101","volume-title":"Compositionality in Computer Vision","year":"2025"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72630-9_3"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00547"},{"key":"ref104","article-title":"Compositional reasoning in vision-language models","author":"Li","year":"2025"},{"key":"ref105","first-page":"29612","article-title":"Vision-language models do not understand negation","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR)","author":"Alhamoud"},{"key":"ref106","first-page":"3707","article-title":"NegVQA: Can vision language models understand negation?","volume-title":"Proc. Findings Assoc. Comput. Linguistics","author":"Zhang"},{"key":"ref107","first-page":"1","article-title":"Interpretable and robust concept reasoning in vision-language models","volume-title":"Proc. AAAI Conf. Artif. Intell.","author":"Trott"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00205"},{"key":"ref109","volume-title":"Paper Page\u2013Multimodal Chain-of-Thought Reasoning: A Comprehensive Survey","year":"2025"},{"key":"ref110","article-title":"HueManity: Probing fine-grained visual perception in MLLMs","author":"Grover","year":"2025","journal-title":"arXiv:2506.03194"},{"key":"ref111","article-title":"CountQA: How well do MLLMs count in the wild?","author":"Tamarapalli","year":"2025","journal-title":"arXiv:2508.06585"},{"key":"ref112","volume-title":"Video-MME Project Page","year":"2024"},{"key":"ref113","article-title":"VCR-Bench: A comprehensive evaluation framework for video chain-of-thought reasoning","author":"Qi","year":"2025","journal-title":"arXiv:2504.07956"},{"key":"ref114","article-title":"MMBench: Is your multi-modal model an all-around player?","author":"Liu","year":"2023","journal-title":"arXiv:2307.06281"},{"key":"ref115","article-title":"LVLM-eHub: A comprehensive evaluation benchmark for large vision-language models","author":"Xu","year":"2023","journal-title":"arXiv:2306.09265"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.441"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01075"},{"key":"ref118","article-title":"Learning to navigate in cities without a map","volume-title":"Proc. NeurIPS","author":"Mirowski"},
NeurIPS","author":"Mirowski"},{"key":"ref119","article-title":"Habitat 3.0: A co-habitat for humans, avatars and robots","author":"Puig","year":"2023","journal-title":"arXiv:2310.13724"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00943"},{"key":"ref121","article-title":"MaRVL-QA: A benchmark for mathematical reasoning over visual landscapes","author":"Pande","year":"2025","journal-title":"arXiv:2508.17180"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"ref123","article-title":"PlotQA: Reasoning over scientific plots","author":"Methani","year":"2019","journal-title":"arXiv:1909.13200"},{"key":"ref124","article-title":"FigureQA: An annotated figure dataset for visual reasoning","author":"Kahou","year":"2017","journal-title":"arXiv:1710.07300"},{"key":"ref125","volume-title":"Polymath: A Challenging Multi-Modal Mathematical Reasoning Benchmark","author":"Gupta","year":"2024"},{"key":"ref126","first-page":"12697","article-title":"Calibrate before use: Improving few-shot performance of language models","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Zhao"},{"key":"ref127","article-title":"Quantifying language models\u2019 sensitivity to spurious features in prompt design or: How I learned to start worrying about prompt formatting","author":"Sclar","year":"2023","journal-title":"arXiv:2310.11324"},{"key":"ref128","volume-title":"MM-VET: Code and Data","year":"2023"},{"key":"ref129","volume-title":"MM-VET Online Evaluator","year":"2023"},{"key":"ref130","first-page":"46595","article-title":"Judging LLM-as-a-judge with MT-bench and chatbot arena","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Zheng"},{"key":"ref131","article-title":"MM-REACT: Prompting ChatGPT for multimodal reasoning and action","author":"Yang","year":"2023","journal-title":"arXiv:2303.11381"},{"key":"ref132","article-title":"A survey on LLM-as-a-judge","author":"Gu","year":"2024","journal-title":"arXiv:2411.15594"},{"key":"ref133","first-page":"1","article-title":"Quantifying memorization across neural language models","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Carlini"},{"key":"ref134","article-title":"SEED-bench: Benchmarking multimodal LLMs with generative comprehension","author":"Li","year":"2023","journal-title":"arXiv:2307.16125"},{"key":"ref135","volume-title":"Multi-Modality-Arena (LVLM-EHUB Platform)","year":"2023"},{"key":"ref136","article-title":"MMMU-pro: A more robust multi-discipline multimodal understanding benchmark","author":"Yue","year":"2024","journal-title":"arXiv:2409.02813"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.1284"},{"key":"ref138","article-title":"Open-ended VQA benchmarking of vision-language models by exploiting classification datasets and their semantic hierarchy","author":"Ging","year":"2024","journal-title":"arXiv:2402.07270"},{"key":"ref139","article-title":"An empirical study of LLM-as-a-judge: How design choices impact evaluation reliability","author":"Yamauchi","year":"2025","journal-title":"arXiv:2506.13639"},{"issue":"6","key":"ref140","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3355390","article-title":"A survey on deep learning for video captioning","volume":"52","author":"Aafaq","year":"2019","journal-title":"ACM Comput. 
Surveys"},{"key":"ref141","volume-title":"Cola: Compose Objects Localized With Attributes","year":"2025"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/tbdata.2025.3536930"},{"key":"ref143","volume-title":"Ailab-CVC\/Seed-Bench","year":"2023"},{"key":"ref144","first-page":"1","article-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension","volume-title":"Proc. CVPR","author":"Li"},{"key":"ref145","first-page":"1","article-title":"Winoground: Probing multimodal reasoning through grounded contrastive sentences","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguistics","author":"Thrush"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2008.4587784"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_3"},{"key":"ref148","volume-title":"Paper Page\u2013VCR-Bench: A Comprehensive Evaluation Framework for Video Chain-of-Thought Reasoning","year":"2025"},{"key":"ref149","first-page":"1","article-title":"The curious case of neural text degeneration","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Holtzman"},{"key":"ref150","article-title":"Raise: A framework for responsible ai scoring and evaluation","author":"Liao","year":"2024"},{"key":"ref151","article-title":"Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned","author":"Ganguli","year":"2022","journal-title":"arXiv:2209.07858"},{"key":"ref152","volume-title":"Levels of AGI: Operationalizing Progress on the Path to AGI","year":"2025"},{"key":"ref153","first-page":"1","article-title":"AI2-THOR: An interactive 3D environment for visual AI","volume-title":"Proc. 5th Workshop AI Social Good","author":"Kolve"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2018.00387"},{"key":"ref155","article-title":"Muep: A unified evaluation platform for multimodal large language models","author":"Li","year":"2024","journal-title":"arxiv: 2406.14234"},{"key":"ref156","article-title":"EmbodiedBench: Comprehensive benchmarking multi-modal large language models for vision-driven embodied agents","author":"Yang","year":"2025","journal-title":"arXiv:2502.09560"},{"issue":"1","key":"ref157","first-page":"220","article-title":"AI guide dog: Egocentric path prediction on smartphone","volume-title":"Proc. AAAI Symp. Ser.","volume":"5","author":"Jadhav"},{"issue":"1","key":"ref158","first-page":"130","article-title":"Better apprenticeship learning with LLM explanations","volume-title":"Proc. AAAI Symp. Ser.","volume":"4","author":"Grover"},{"key":"ref159","article-title":"MAEA: Multimodal attribution for embodied AI","author":"Jain","year":"2023","journal-title":"arXiv:2307. 13850"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.3390\/s25020477"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00901"},{"key":"ref162","first-page":"2776","article-title":"Benchmarking language model creativity: A case study on code generation","volume-title":"Proc. Conf. Nations Americas Chapter Assoc. Comput. Linguistics, Human Lang. Technol.","author":"Lu"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.684"},{"key":"ref164","article-title":"Evaluating and modeling social intelligence: A comparative study of human and ai capabilities","author":"Wang","year":"2024","journal-title":"arXiv:2405. 
11841"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.18608\/jla.2025.8571"},{"key":"ref166","article-title":"AI and the everything in the whole wide world benchmark","volume-title":"Proc. FACCT","author":"Raji"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2024.121482"},{"key":"ref168","first-page":"1","article-title":"A comprehensive survey on multi-view classification: Methods, applications, and challenges","volume":"109","author":"Fang","year":"2024","journal-title":"Inf. Fusion"},{"issue":"164","key":"ref169","first-page":"1","article-title":"Improving reproducibility in machine learning research (Neurips 2019 reproducibility program)","volume":"22","author":"Pineau","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.225"},{"key":"ref171","article-title":"Livebench: A continuous, dynamic benchmark for evaluating LLMs","author":"White","year":"2024"},{"key":"ref172","volume-title":"Semantic Versioning 2.0.0","year":"2013"},{"key":"ref173","article-title":"MMLU-pro: A more robust and challenging multi-task language understanding benchmark","author":"Wang","year":"2024","journal-title":"arXiv:2406.01574"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.656"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.trustnlp-main.23"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1145\/3458723"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1145\/3287560.3287596"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00041"},{"key":"ref179","first-page":"20071","article-title":"A Whac-A-Mole dilemma: Shortcuts come in multiples where mitigating one amplifies others","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. 
(CVPR)","author":"Li"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.235"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1609\/aies.v8i1.36595"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2025\/1153"},{"key":"ref183","volume-title":"A New Era of Intelligence With Gemini 3","year":"2025"},{"key":"ref184","volume-title":"Humanity\u2019s Last Exam: Benchmarking the Limits of AI","year":"2025"},{"key":"ref185","volume-title":"Frontiermath: Evaluating Advanced Mathematical Reasoning in AI","year":"2024"},{"key":"ref186","volume-title":"OpenAI O3 System Card","year":"2025"},{"issue":"8","key":"ref187","first-page":"1","article-title":"Video generation models as world simulators","volume":"1","author":"Brooks","year":"2024","journal-title":"OpenAI Blog"},{"key":"ref188","volume-title":"Evaluating Ai Language Models Just Got More Effective and Efficient","year":"2025"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412739"},{"key":"ref190","volume-title":"Embodied Symbiotic Assistants That See, Act, Infer and Chat","year":"2023"},{"key":"ref191","article-title":"Semantic augmentation in images using language","author":"Yerramilli","year":"2024","journal-title":"arXiv:2404.02353"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/11323511\/11317986.pdf?arnumber=11317986","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:03:36Z","timestamp":1768255416000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11317986\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":191,"URL":"https:\/\/doi.org\/10.1109\/access.2025.3649182","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}