{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T12:51:01Z","timestamp":1777380661494,"version":"3.51.4"},"reference-count":415,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T00:00:00Z","timestamp":1772496000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Array"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.array.2026.100739","type":"journal-article","created":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T17:25:30Z","timestamp":1772731530000},"page":"100739","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["A systematic review of vision language models: Comprehensive analysis of architectures, applications, datasets and challenges towards robust multimodal intelligence"],"prefix":"10.1016","volume":"30","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-6718-8869","authenticated-orcid":false,"given":"Arifur","family":"Rahman","sequence":"first","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.array.2026.100739_b1","first-page":"69925","article-title":"VisionLLM v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks","volume":"37","author":"Wu","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b2","series-title":"LMOD: A large multimodal ophthalmology dataset and benchmark for large vision-language models","author":"Qin","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b3","series-title":"VisualWebInstruct: Scaling up multimodal instruction data through web search","author":"Jia","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b4","series-title":"International conference on machine learning","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b5","doi-asserted-by":"crossref","unstructured":"Kayser Maxime, et al. E-ViL: A dataset and benchmark for natural language explanations in vision-language tasks. In: Proceedings of the IEEE\/CVF international conference on computer vision. 2021.","DOI":"10.1109\/ICCV48922.2021.00128"},{"key":"10.1016\/j.array.2026.100739_b6","first-page":"34892","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b7","series-title":"CoCa: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b8","series-title":"Microsoft COCO captions: Data collection and evaluation server","author":"Chen","year":"2015"},{"key":"10.1016\/j.array.2026.100739_b9","doi-asserted-by":"crossref","DOI":"10.3389\/frai.2024.1430984","article-title":"Vision-language models for medical report generation and visual question answering: A review","volume":"7","author":"Hartsock","year":"2024","journal-title":"Front Artif Intell"},{"key":"10.1016\/j.array.2026.100739_b10","doi-asserted-by":"crossref","unstructured":"Goyal Yash, et al. Making the V in VQA matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2017.","DOI":"10.1109\/CVPR.2017.670"},{"key":"10.1016\/j.array.2026.100739_b11","series-title":"MIMIC-ext-MIMIC-CXR-vqa: A complex, diverse, and large-scale visual question answering dataset for chest X-ray images","author":"Bae","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b12","series-title":"Med3DVLM: An efficient vision-language model for 3D medical image analysis","author":"Xin","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b13","series-title":"2024 IEEE\/ACM Conference on Connected Health: Applications, Systems and Engineering Technologies","article-title":"On large visual language models for medical imaging analysis: An empirical study","author":"Van","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b14","series-title":"2025 IEEE\/CVF winter conference on applications of computer vision","article-title":"COVLA: Comprehensive vision-language-action dataset for autonomous driving","author":"Arai","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b15","doi-asserted-by":"crossref","unstructured":"Marathe Aboli, et al. WEDGE: A multi-weather autonomous driving dataset built from generative vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023.","DOI":"10.1109\/CVPRW59228.2023.00334"},{"key":"10.1016\/j.array.2026.100739_b16","doi-asserted-by":"crossref","unstructured":"Ma Yunsheng, et al. Lampilot: An open benchmark dataset for autonomous driving with language model programs. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.01434"},{"key":"10.1016\/j.array.2026.100739_b17","article-title":"Vision-language models in remote sensing: Current progress and future trends","author":"Li","year":"2024","journal-title":"IEEE Geosci Remote Sens Mag"},{"key":"10.1016\/j.array.2026.100739_b18","doi-asserted-by":"crossref","first-page":"64","DOI":"10.1016\/j.isprsjprs.2025.01.020","article-title":"SkyEyeGPT: Unifying remote sensing vision-language tasks via instruction tuning with large language model","volume":"221","author":"Zhan","year":"2025","journal-title":"ISPRS J Photogramm Remote Sens"},{"key":"10.1016\/j.array.2026.100739_b19","series-title":"VRSBench: A versatile vision-language benchmark dataset for remote sensing image understanding","author":"Li","year":"2024"},{"issue":"4","key":"10.1016\/j.array.2026.100739_b20","doi-asserted-by":"crossref","first-page":"719","DOI":"10.3390\/rs17040719","article-title":"DDFAV: Remote sensing large vision language models dataset and evaluation benchmark","volume":"17","author":"Li","year":"2025","journal-title":"Remote Sens"},{"key":"10.1016\/j.array.2026.100739_b21","series-title":"Multimodal arxiv: A dataset for improving scientific comprehension of large vision-language models","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b22","series-title":"X-lxmert: Paint, caption and answer questions with multi-modal transformers","author":"Cho","year":"2020"},{"key":"10.1016\/j.array.2026.100739_b23","series-title":"GMAI-VL & GMAI-VL-5.5 m: A large vision-language model and a comprehensive multimodal dataset towards general medical AI","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b24","doi-asserted-by":"crossref","unstructured":"Kuckreja Kartik, et al. GeoChat: Grounded large vision-language model for remote sensing. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.02629"},{"key":"10.1016\/j.array.2026.100739_b25","series-title":"International conference on machine learning","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b26","series-title":"Beit: Bert pre-training of image transformers","author":"Bao","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b27","first-page":"2","article-title":"A review on vision-language-based approaches: Challenges and applications","volume":"82","author":"Ho","year":"2025","journal-title":"Comput Mater Contin"},{"key":"10.1016\/j.array.2026.100739_b28","series-title":"POINTS1.5: Building a vision-language model towards real world applications","author":"Liu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b29","series-title":"ISARC. Proceedings of the International Symposium on Automation and Robotics in Construction","article-title":"VL-con: Vision-language dataset for deep learning-based construction monitoring applications","volume":"Vol. 41","author":"Hsu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b30","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b31","series-title":"International conference on machine learning","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b32","series-title":"European conference on computer vision","article-title":"Uniter: Universal image-text representation learning","author":"Chen","year":"2020"},{"key":"10.1016\/j.array.2026.100739_b33","first-page":"25278","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume":"35","author":"Schuhmann","year":"2022","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b34","first-page":"1","article-title":"ChatEarthNet: A global-scale image-text dataset empowering vision-language geo-foundation models","volume":"2024","author":"Yuan","year":"2024","journal-title":"Earth Syst Sci Data Discuss"},{"key":"10.1016\/j.array.2026.100739_b35","doi-asserted-by":"crossref","first-page":"35959","DOI":"10.52202\/068431-2606","article-title":"PyramidCLIP: Hierarchical feature alignment for vision-language model pretraining","volume":"35","author":"Gao","year":"2022","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b36","series-title":"MiniGPT-v2: Large language model as a unified interface for vision-language multi-task learning","author":"Chen","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b37","first-page":"1890","article-title":"Voila-A: Aligning vision-language models with user\u2019s gaze attention","volume":"37","author":"Yan","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b38","doi-asserted-by":"crossref","first-page":"17972","DOI":"10.52202\/079017-0571","article-title":"SugarCrepe++ dataset: Vision-language model sensitivity to semantic and lexical alterations","volume":"37","author":"Dumpala","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b39","doi-asserted-by":"crossref","unstructured":"Xu Jiarui, et al. Groupvit: Semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"10.1016\/j.array.2026.100739_b40","doi-asserted-by":"crossref","unstructured":"Ding Jian, et al. Decoupling zero-shot semantic segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"10.1016\/j.array.2026.100739_b41","doi-asserted-by":"crossref","unstructured":"Zhang Pengchuan, et al. VinVL: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2021.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"10.1016\/j.array.2026.100739_b42","series-title":"Medvlm-r1: Incentivizing medical reasoning capability of vision-language models (vlms) via reinforcement learning","author":"Pan","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b43","series-title":"SATIN: A multi-task metadataset for classifying satellite imagery using vision-language models","author":"Roberts","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b44","doi-asserted-by":"crossref","DOI":"10.1109\/ACCESS.2025.3552613","article-title":"A novel dataset for polyp segmentation and detection using a vision-language model","author":"Alilou","year":"2025","journal-title":"IEEE Access"},{"key":"10.1016\/j.array.2026.100739_b45","doi-asserted-by":"crossref","unstructured":"L\u00fcddecke Timo, Ecker Alexander. Image segmentation using text and image prompts. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"10.1016\/j.array.2026.100739_b46","series-title":"European conference on computer vision","article-title":"Scaling open-vocabulary image segmentation with image-level labels","author":"Ghiasi","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b47","doi-asserted-by":"crossref","unstructured":"Sun Zeyi, et al. Alpha-clip: A clip model focusing on wherever you want. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.01237"},{"key":"10.1016\/j.array.2026.100739_b48","series-title":"Meta clip 2: A worldwide scaling recipe","author":"Chuang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b49","doi-asserted-by":"crossref","unstructured":"Li Liunian Harold, et al. Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"10.1016\/j.array.2026.100739_b50","first-page":"32897","article-title":"VLMO: Unified vision-language pre-training with mixture-of-modality-experts","volume":"35","author":"Bao","year":"2022","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b51","doi-asserted-by":"crossref","unstructured":"Girdhar Rohit, et al. Imagebind: One embedding space to bind them all. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023.","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"10.1016\/j.array.2026.100739_b52","doi-asserted-by":"crossref","unstructured":"Xu Hu, et al. VideoCLIP: Contrastive pre-training for zero-shot video-text understanding. In: Proceedings of the 2021 conference on empirical methods in natural language processing. 2021.","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"10.1016\/j.array.2026.100739_b53","first-page":"10078","article-title":"VideoMAE: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong","year":"2022","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b54","series-title":"The dawn of lmms: Preliminary explorations with gpt-4v (ision)","author":"Yang","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b55","series-title":"Llava-o1: Let vision language models reason step-by-step","author":"Xu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b56","series-title":"European conference on computer vision","article-title":"Llava-plus: Learning to use tools for creating multimodal agents","author":"Liu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b57","series-title":"Sensitivity of generative VLMs to semantically and lexically altered prompts","author":"Dumpala","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b58","series-title":"Vision-language foundation models as effective robot imitators","author":"Li","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b59","first-page":"87874","article-title":"What matters when building vision-language models?","volume":"37","author":"Lauren\u00e7on","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b60","series-title":"PaLI: A jointly-scaled multilingual language-image model","author":"Chen","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b61","series-title":"Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b62","article-title":"Fuyu-8B: A multimodal architecture for AI agents","volume":"3","author":"Bavishi","year":"2023","journal-title":"Adept AI Blog"},{"key":"10.1016\/j.array.2026.100739_b63","series-title":"Sphinx: The joint mixing of weights, tasks, and visual embeddings for multi-modal large language models","author":"Lin","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b64","doi-asserted-by":"crossref","unstructured":"Piergiovanni A J, et al. Mirasol3B: A multimodal autoregressive model for time-aligned and contextual modalities. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.02531"},{"key":"10.1016\/j.array.2026.100739_b65","series-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b66","first-page":"121475","article-title":"CogVLM: Visual expert for pretrained language models","volume":"37","author":"Wang","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b67","series-title":"Ferret-V2: An improved baseline for referring and grounding with large language models","author":"Zhang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b68","doi-asserted-by":"crossref","unstructured":"Wang Zhaoqing, et al. Lavin-DIT: Large Vision Diffusion Transformer. In: Proceedings of the computer vision and pattern recognition conference. 2025.","DOI":"10.1109\/CVPR52734.2025.01868"},{"key":"10.1016\/j.array.2026.100739_b69","series-title":"Palm-e: An embodied multimodal language model","author":"Driess","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b70","article-title":"InstructBLIP: A general-purpose vision-language model enhanced via instruction tuning","author":"Dai","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b71","first-page":"72096","article-title":"Language is not all you need: Aligning perception with language models","volume":"36","author":"Huang","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b72","series-title":"Kosmos-2: Grounding multimodal large language models to the world","author":"Peng","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b73","doi-asserted-by":"crossref","unstructured":"Xu Zhiyang, Shen Ying, Huang Lifu. Multiinstruct: Improving multi-modal zero-shot learning via instruction tuning. In: Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: long papers). 2023.","DOI":"10.18653\/v1\/2023.acl-long.641"},{"key":"10.1016\/j.array.2026.100739_b74","series-title":"F-VLM: Open-vocabulary object detection upon frozen vision and language models","author":"Kuo","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b75","series-title":"COVLM: Composing visual entities and relationships in large language models via communicative decoding","author":"Li","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b76","series-title":"International conference on machine learning","article-title":"MPLUG-2: A modularized multi-modal foundation model across text, image, and video","author":"Xu","year":"2023"},{"issue":"5","key":"10.1016\/j.array.2026.100739_b77","doi-asserted-by":"crossref","first-page":"3156","DOI":"10.1109\/TPAMI.2023.3339661","article-title":"X22-VLM: All-in-one pre-trained model for vision-language tasks","volume":"46","author":"Zeng","year":"2023","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10.1016\/j.array.2026.100739_b78","series-title":"EPIA conference on artificial intelligence","article-title":"Evaluation of lyrics extraction from folk music sheets using vision language models (VLMs)","author":"Sales Mendes","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b79","series-title":"Prism: Projection-based reward integration for scene-aware real-to-sim-to-real transfer with few demonstrations","author":"Sun","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b80","doi-asserted-by":"crossref","unstructured":"Zhang Xinsong, et al. Toward building general foundation models for language, vision, and vision-language understanding tasks. In: Findings of the association for computational linguistics: EMNLP 2023. 2023.","DOI":"10.18653\/v1\/2023.findings-emnlp.40"},{"key":"10.1016\/j.array.2026.100739_b81","series-title":"MM-React: Prompting ChatGPT for multimodal reasoning and action","author":"Yang","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b82","doi-asserted-by":"crossref","unstructured":"Yang Zhengyuan, et al. An empirical study of GPT-3 for few-shot knowledge-based VQA. In: Proceedings of the AAAI conference on artificial intelligence. 36, (3). 2022.","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"10.1016\/j.array.2026.100739_b83","doi-asserted-by":"crossref","unstructured":"Tiong Anthony Meng Huat, et al. Plug-and-play VQA: Zero-shot VQA by conjoining large pretrained models with zero training. In: Findings of the association for computational linguistics: EMNLP 2022. 2022.","DOI":"10.18653\/v1\/2022.findings-emnlp.67"},{"key":"10.1016\/j.array.2026.100739_b84","series-title":"Simvlm: Simple visual language model pretraining with weak supervision","author":"Wang","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b85","series-title":"TinyGPT-V: Efficient multimodal large language model via small backbones","author":"Yuan","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b86","first-page":"71995","article-title":"GPT4Tools: Teaching large language model to use tools via self-instruction","volume":"36","author":"Yang","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b87","doi-asserted-by":"crossref","unstructured":"Ye Qinghao, et al. MPLUG-OWL2: Revolutionizing multi-modal large language model with modality collaboration. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.01239"},{"key":"10.1016\/j.array.2026.100739_b88","series-title":"M3IT: A large-scale dataset towards multi-modal multilingual instruction tuning","author":"Li","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b89","series-title":"European conference on computer vision","article-title":"Llama-Vid: An image is worth 2 tokens in large language models","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b90","series-title":"Video-LLaMA: An instruction-tuned audio-visual language model for video understanding","author":"Zhang","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b91","series-title":"ChatBridge: Bridging modalities with large language model as a language catalyst","author":"Zhao","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b92","doi-asserted-by":"crossref","unstructured":"Maaz Muhammad, et al. Video-ChatGPT: Towards detailed video understanding via large vision and language models. In: Proceedings of the 62nd annual meeting of the association for computational linguistics (Volume 1: Long papers). 2024.","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"10.1016\/j.array.2026.100739_b93","article-title":"VALOR: Vision-audio-language omni-perception pretraining model and dataset","author":"Liu","year":"2024","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10.1016\/j.array.2026.100739_b94","series-title":"Macaw-LLM: Multi-modal language modeling with image, audio, video, and text integration","author":"Lyu","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b95","series-title":"Pandagpt: One model to instruction-follow them all","author":"Su","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b96","first-page":"16083","article-title":"Any-to-any generation via composable diffusion","volume":"36","author":"Tang","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b97","doi-asserted-by":"crossref","unstructured":"Tang Zineng, et al. Codi-2: In-context interleaved and interactive any-to-any generation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.02589"},{"key":"10.1016\/j.array.2026.100739_b98","series-title":"Gemini robotics: Bringing ai into the physical world","author":"Team","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b99","series-title":"Videopoet: A large language model for zero-shot video generation","author":"Kondratyuk","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b100","series-title":"Conference on robot learning","article-title":"Cliport: What and where pathways for robotic manipulation","author":"Shridhar","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b101","series-title":"A generalist agent","author":"Reed","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b102","series-title":"R3m: A universal visual representation for robot manipulation","author":"Nair","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b103","series-title":"Rt-1: Robotics transformer for real-world control at scale","author":"Brohan","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b104","series-title":"Inner monologue: Embodied reasoning through planning with language models","author":"Huang","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b105","series-title":"Latte: Language trajectory transformer","author":"Bucker","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b106","series-title":"Conference on Robot Learning","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","author":"Zitkovich","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b107","series-title":"Instruct2act: Mapping multi-modality instructions to robotic actions with large language model","author":"Huang","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b108","series-title":"Vima: General robot manipulation with multimodal prompts","first-page":"6","author":"Jiang","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b109","first-page":"22304","article-title":"Compositional foundation models for hierarchical planning","volume":"36","author":"Ajay","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b110","series-title":"Instruction-following agents with multimodal transformer","author":"Liu","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b111","series-title":"International conference on machine learning","article-title":"Liv: Language-image representations and rewards for robotic control","author":"Ma","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b112","series-title":"Se (3)-diffusionfields: Learning smooth cost functions for joint grasp and motion optimization through diffusion","author":"Urain","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b113","series-title":"Vip: Towards universal visual reward and representation via value-implicit pre-training","author":"Ma","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b114","first-page":"9156","article-title":"Learning universal policies via text-guided video generation","volume":"36","author":"Du","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b115","doi-asserted-by":"crossref","unstructured":"Bahl Shikhar, et al. Affordances from human videos as a versatile representation for robotics. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023.","DOI":"10.1109\/CVPR52729.2023.01324"},{"key":"10.1016\/j.array.2026.100739_b116","series-title":"Structdiffusion: Language-guided creation of physically-valid structures using unseen objects","author":"Liu","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b117","series-title":"Genaug: Retargeting behaviors to unseen situations via generative augmentation","author":"Chen","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b118","series-title":"Zero-shot robotic manipulation with pretrained image-editing diffusion models","author":"Black","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b119","series-title":"Scaling robot learning with semantically imagined experience","author":"Yu","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b120","series-title":"2024 IEEE International Conference on Robotics and Automation","article-title":"Nomad: Goal masked diffusion policies for navigation and exploration","author":"Sridhar","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b121","series-title":"Cacti: A framework for scalable multi-task multi-scene visual imitation learning","author":"Mandi","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b122","series-title":"An embodied generalist agent in 3d world","author":"Huang","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b123","series-title":"Octo: An open-source generalist robot policy","author":"Team","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b124","series-title":"2025 IEEE International Conference on Robotics and Automation","article-title":"Robot utility models: General policies for zero-shot deployment in new environments","author":"Etukuru","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b125","series-title":"Robomm: All-in-one multimodal large model for robotic manipulation","first-page":"5","author":"Yan","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b126","series-title":"Llara: Supercharging robot learning data for vision-language policy","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b127","series-title":"Robotic control via embodied chain-of-thought reasoning","author":"Zawalski","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b128","series-title":"2024 IEEE International Conference on Systems, Man, and Cybernetics","article-title":"Bi-vla: Vision-language-action model-based system for bimanual robotic dexterous manipulations","author":"Gbagbe","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b129","series-title":"Scaling cross-embodied learning: One policy for manipulation, navigation, locomotion and aviation","author":"Doshi","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b130","series-title":"Mobility vla: Multimodal instruction navigation with long-context vlms and topological graphs","author":"Chiang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b131","series-title":"Unleashing large-scale video generative pre-training for visual robot manipulation","author":"Wu","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b132","series-title":"Actra: Optimized transformer architecture for vision-language-action models in robot learning","author":"Ma","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b133","first-page":"56619","article-title":"Deer-vla: Dynamic inference of multimodal large language models for efficient robot execution","volume":"37","author":"Yue","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b134","series-title":"Gr-2: A generative video-language-action model with web-scale knowledge for robot manipulation","author":"Cheang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b135","series-title":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems","article-title":"Robonurse-vla: Robotic scrub nurse system based on vision-language-action model","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b136","series-title":"Proceedings of the AAAI conference on artificial intelligence","article-title":"Film: Visual reasoning with a general conditioning layer","volume":"vol. 32. no. 1","author":"Perez","year":"2018"},{"key":"10.1016\/j.array.2026.100739_b137","doi-asserted-by":"crossref","first-page":"1684","DOI":"10.1177\/02783649241273668","article-title":"Diffusion policy: Visuomotor policy learning via action diffusion","author":"Chi","year":"2025","journal-title":"Int J Robot Res 44.10-11"},{"key":"10.1016\/j.array.2026.100739_b138","series-title":"Yell at your robot: Improving on-the-fly from language corrections","author":"Shi","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b139","series-title":"Nikolaos gkanatsios, and katerina fragkiadaki\u201d3d diffuser actor: Policy diffusion with 3d scene representations","author":"Ke","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b140","series-title":"3D-vla: A 3d vision-language-action generative world model","author":"Zhen","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b141","series-title":"Learning to act from actionless videos through dense correspondences","author":"Ko","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b142","series-title":"Rekep: Spatio-temporal reasoning of relational keypoint constraints for robotic manipulation","author":"Huang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b143","series-title":"Multimodal diffusion transformer: Learning versatile behavior from multimodal goals","author":"Reuss","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b144","series-title":"Cogact: A foundational vision-language-action model for synergizing cognition and action in robotic manipulation","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b145","series-title":"RoboPoint: A vision-language model for spatial affordance prediction for robotics","author":"Yuan","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b146","first-page":"17541","article-title":"Peria: Perceive, reason, imagine, act via holistic language and vision planning for manipulation","volume":"37","author":"Ni","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b147","first-page":"20198","article-title":"Grounding multimodal large language models in actions","volume":"37","author":"Szot","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b148","series-title":"Improving vision-language-action models via chain-of-affordance","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b149","series-title":"European Conference on Computer Vision","article-title":"Track2act: Predicting point tracks from internet videos enables generalizable robot manipulation","author":"Bharadhwaj","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b150","first-page":"655","article-title":"Where are we in the search for an artificial visual cortex for embodied intelligence?","volume":"36","author":"Majumdar","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b151","series-title":"\u03c00: A vision-language-action flow model for general robot control","author":"Black","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b152","series-title":"Edgevla: Efficient vision-language-action models","author":"Budzianowski","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b153","first-page":"40085","article-title":"Robomamba: Efficient vision-language-action model for robotic reasoning and manipulation","volume":"37","author":"Liu","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b154","series-title":"Vision-language models provide promptable representations for reinforcement learning","author":"Chen","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b155","series-title":"Adaptive language-guided abstraction from contrastive explanations","author":"Peng","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b156","series-title":"Grape: Generalizing robot policy via preference alignment","author":"Zhang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b157","series-title":"Rldg: Robotic generalist policy distillation via reinforcement learning","author":"Xu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b158","series-title":"A dual process vla: Efficient robotic manipulation leveraging vlm","author":"Han","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b159","series-title":"Up-vla: A unified understanding and prediction model for embodied agent","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b160","series-title":"Rdt1b: a diffusion foundation model for bimanual manipulation","author":"Liu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b161","doi-asserted-by":"crossref","unstructured":"Zheng Jinliang, et al. Universal actions for enhanced embodied foundation models. In: Proceedings of the Computer Vision and Pattern Recognition Conference. 2025.","DOI":"10.1109\/CVPR52734.2025.02096"},{"key":"10.1016\/j.array.2026.100739_b162","series-title":"Time-unified diffusion policy with action discrimination for robotic manipulation","author":"Niu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b163","series-title":"Humanoid-vla: Towards universal humanoid control with visual integration","author":"Ding","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b164","series-title":"CDP: Towards robust autoregressive visuomotor policy learning via causal diffusion","author":"Ma","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b165","series-title":"Nora: A small open-sourced generalist vision language action model for embodied tasks","author":"Hung","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b166","series-title":"Discrete diffusion vla: Bringing discrete diffusion to action decoding in vision-language-action policies","author":"Liang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b167","series-title":"OneTwoVLA: A unified vision-language-action model with adaptive reasoning","author":"Lin","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b168","series-title":"Task reconstruction and extrapolation for \u03c00 using text latent","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b169","series-title":"Vote: vision-language-action optimization with trajectory ensemble voting","author":"Lin","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b170","series-title":"Dita: Scaling diffusion transformer for generalist vision-language-action policy","author":"Hou","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b171","series-title":"Univla: Learning to act anywhere with task-centric latent actions","author":"Bu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b172","series-title":"Diffusion-VLA: Generalizable and interpretable robot foundation model via self-generated reasoning","author":"Wen","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b173","series-title":"Unveiling the potential of vision-language-action models with open-ended multimodal instructions","author":"Zhao","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b174","series-title":"ForceVLA: Enhancing VLA models with a force-aware MoE for contact-rich manipulation","author":"Yu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b175","series-title":"2025 20th ACM\/IEEE International Conference on Human-Robot Interaction","article-title":"UavVLA: Vision-language-action system for large scale aerial mission generation","author":"Sautenkov","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b176","doi-asserted-by":"crossref","DOI":"10.1109\/LRA.2025.3544909","article-title":"Tinyvla: Towards fast, data-efficient vision-language-action models for robotic manipulation","author":"Wen","year":"2025","journal-title":"IEEE Robot Autom Lett"},{"key":"10.1016\/j.array.2026.100739_b177","series-title":"TactileVLA: unlocking vision-language-action model\u2019s physical knowledge for tactile generalization","author":"Huang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b178","series-title":"Smolvla: A vision-language-action model for affordable and efficient robotics","author":"Shukor","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b179","series-title":"2025 20th ACM\/IEEE International Conference on Human-Robot Interaction","article-title":"Shake-vla: Vision-language-action model-based system for bimanual robotic manipulations and liquid mixing","author":"Khan","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b180","series-title":"VQ-VLA: Improving vision-language-action models via scaling vector-quantized action tokenizers","author":"Wang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b181","series-title":"Hi robot: Open-ended instruction following with hierarchical vision-language-action models","author":"Shi","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b182","series-title":"DexVLG: Dexterous vision-language-grasp model at scale","author":"He","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b183","series-title":"Dexgraspvla: A vision-language-action framework towards general dexterous grasping","author":"Zhong","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b184","series-title":"AC-DiT: Adaptive coordination diffusion transformer for mobile manipulation","author":"Chen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b185","series-title":"Hamster: Hierarchical action models for open-world robot manipulation","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b186","series-title":"Dexvla: Vision-language model with plug-in diffusion expert for general robot control","author":"Wen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b187","doi-asserted-by":"crossref","unstructured":"Zhao Qingqing, et al. Cot-vla: Visual chain-of-thought reasoning for vision-language-action models. In: Proceedings of the computer vision and pattern recognition conference. 2025.","DOI":"10.1109\/CVPR52734.2025.00166"},{"key":"10.1016\/j.array.2026.100739_b188","series-title":"MinD: Unified visual imagination and control via hierarchical world models","author":"Chi","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b189","series-title":"Hume: Introducing system-2 thinking in visual-language-action model","author":"Song","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b190","series-title":"CognitiveDrone: A VLA model and evaluation benchmark for real-time cognitive task solving and reasoning in UAVs","author":"Lykov","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b191","series-title":"TriVLA: A unified triple-system-based unified vision-language-action model for general robot control","author":"Liu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b192","series-title":"Inspire: Vision-language-action models with intrinsic spatial reasoning","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b193","series-title":"2025 IEEE International Conference on Robotics and Automation","article-title":"Run-time observation interventions make vision-language-action models more visually robust","author":"Hancock","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b194","series-title":"\u03c00.5: A vision-language-action model with open-world generalization","author":"Intelligence","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b195","series-title":"Dreamvla: a vision-language-action model dreamed with comprehensive world knowledge","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b196","series-title":"Gevrm: Goal-expressive video generation model for robust visual manipulation","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b197","unstructured":"Chen Yi, et al. Moto: Latent motion token as the bridging language for learning robot manipulation from videos. In: Proceedings of the IEEE\/CVF international conference on computer vision. 2025."},{"key":"10.1016\/j.array.2026.100739_b198","series-title":"DriveMoE: Mixture-of-experts for vision-language-action model in end-to-end autonomous driving","author":"Yang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b199","series-title":"Opendrivevla: Towards end-to-end autonomous driving with large vision language action model","author":"Zhou","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b200","series-title":"DreamGen: Unlocking generalization in robot learning through video world models","author":"Jang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b201","series-title":"Graspvla: a grasping foundation model pre-trained on billion-scale synthetic action data","author":"Deng","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b202","series-title":"Enerverse: Envisioning embodied future space for robotics manipulation","author":"Huang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b203","series-title":"Racevla: Vla-based racing drone navigation with human-like behaviour","author":"Serpiva","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b204","doi-asserted-by":"crossref","unstructured":"Chen Hanzhi, et al. VidBot: Learning Generalizable 3D Actions from In-the-Wild 2D Human Videos for Zero-Shot Robotic Manipulation. In: Proceedings of the computer vision and pattern recognition conference. 2025.","DOI":"10.1109\/CVPR52734.2025.02576"},{"key":"10.1016\/j.array.2026.100739_b205","series-title":"Vtla: Vision-tactile-language-action model with preference learning for insertion manipulation","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b206","series-title":"Fine-tuning vision-language-action models: Optimizing speed and success","author":"Kim","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b207","series-title":"Pointvla: Injecting the 3d world into vision-language-action models","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b208","series-title":"Trackvla: Embodied visual tracking in the wild","author":"Wang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b209","series-title":"WorldVLA: Towards autoregressive action world model","author":"Cen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b210","series-title":"Fp3: A 3d foundation policy for robotic manipulation","author":"Yang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b211","series-title":"Tracevla: Visual trace prompting enhances spatial\u2013temporal awareness for generalist robotic policies","author":"Zheng","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b212","series-title":"Gr00t n1: An open foundation model for generalist humanoid robots","author":"Bjorck","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b213","series-title":"Interleave-vla: Enhancing robot manipulation with interleaved image-text instructions","author":"Fan","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b214","series-title":"Objectvla: End-to-end open-world object manipulation without demonstration","author":"Zhu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b215","doi-asserted-by":"crossref","unstructured":"Wu Zhenyu, et al. Momanipvla: Transferring vision-language-action models for general mobile manipulation. In: Proceedings of the computer vision and pattern recognition conference. 2025.","DOI":"10.1109\/CVPR52734.2025.00167"},{"key":"10.1016\/j.array.2026.100739_b216","series-title":"Switchvla: Execution-aware task switching for vision-language-action models","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b217","series-title":"Fast: Efficient action tokenization for vision-language-action models","author":"Pertsch","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b218","series-title":"A0: An affordance-aware hierarchical model for general robotic manipulation","author":"Xu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b219","series-title":"Spatialvla: Exploring spatial representations for visual-language-action model","author":"Qu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b220","series-title":"Pixel motion as universal representation for robot control","author":"Ranasinghe","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b221","series-title":"Evo-0: Vision-language-action model with implicit spatial understanding","author":"Lin","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b222","series-title":"Vla-cache: Towards efficient vision-language-action model via adaptive token caching in robotic manipulation","author":"Xu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b223","series-title":"ELEMENTAL: Interactive learning from demonstrations and vision-language models for reward design in robotics","author":"Chen","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b224","series-title":"Mole-vla: Dynamic layer-skipping vision language action model via mixture-of-layers for efficient robot manipulation","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b225","series-title":"Navila: Legged robot vision-language-action model for navigation","author":"Cheng","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b226","series-title":"Accelerating vision-language-action model integrated with action chunking via parallel decoding","author":"Song","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b227","series-title":"SafeVLA: Towards safety alignment of vision-language-action model via constrained learning","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b228","series-title":"BitVLA: 1-bit vision-language-action models for robotics manipulation","author":"Wang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b229","series-title":"Improving vision-language-action model with online reinforcement learning","author":"Guo","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b230","article-title":"Gr-mg: Leveraging partially-annotated data via multi-modal goal-conditioned policy","author":"Li","year":"2025","journal-title":"IEEE Robot Autom Lett"},{"key":"10.1016\/j.array.2026.100739_b231","series-title":"ReinboT: Amplifying robot visual-language manipulation with reinforcement learning","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b232","series-title":"LoHoVLA: A unified vision-language-action model for long-horizon embodied tasks","author":"Yang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b233","series-title":"Conrft: A reinforced fine-tuning method for vla models via consistency policy","author":"Chen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b234","series-title":"Otter: A vision-language-action model with text-aware visual feature extraction","author":"Huang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b235","series-title":"More: Unlocking scalability in reinforcement learning for quadruped vision-language-action models","author":"Zhao","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b236","doi-asserted-by":"crossref","unstructured":"Zhou Zhongyi, et al. Chatvla: Unified multimodal understanding and robot control with vision-language-action model. In: Proceedings of the 2025 conference on empirical methods in natural language processing. 2025.","DOI":"10.18653\/v1\/2025.emnlp-main.273"},{"key":"10.1016\/j.array.2026.100739_b237","series-title":"Simplevla-rl: Scaling vla training via reinforcement learning","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b238","series-title":"Leverb: Humanoid whole-body control with latent vision-language instruction","author":"Xue","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b239","series-title":"Helix: A vision-language-action model for generalist humanoid control","author":"Figure","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b240","series-title":"AutoVLA: A vision-language-action model for end-to-end autonomous driving with adaptive reasoning and reinforcement fine-tuning","author":"Zhou","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b241","series-title":"Autort: Embodied foundation models for large scale orchestration of robotic agents","author":"Ahn","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b242","series-title":"Refined policy distillation: From VLA generalists to RL experts","author":"J\u00fclg","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b243","doi-asserted-by":"crossref","unstructured":"Wu Zhenyu, et al. Momanipvla: Transferring vision-language-action models for general mobile manipulation. In: Proceedings of the Computer Vision and Pattern Recognition Conference. 2025.","DOI":"10.1109\/CVPR52734.2025.00167"},{"key":"10.1016\/j.array.2026.100739_b244","series-title":"RLRC: Reinforcement learning-based recovery for compressed vision-language-action models","author":"Chen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b245","doi-asserted-by":"crossref","unstructured":"Wang Feiyang, Yu Xiaomin, Wu Wangyu. CubeRobot: Grounding Language in Rubik\u2019s Cube Manipulation via Vision-Language Model. In: Companion Proceedings of the ACM on web conference 2025. 2025.","DOI":"10.1145\/3701716.3717565"},{"key":"10.1016\/j.array.2026.100739_b246","series-title":"Vla-rl: Towards masterful and general robotic manipulation with scalable reinforcement learning","author":"Lu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b247","unstructured":"Wang Taowen, et al. Exploring the adversarial vulnerabilities of vision-language-action models in robotics. In: Proceedings of the IEEE\/CVF international conference on computer vision. 2025."},{"key":"10.1016\/j.array.2026.100739_b248","series-title":"AutoDrive-R 2: Incentivizing reasoning and self-reflection capacity for VLA model in autonomous driving","author":"Yuan","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b249","series-title":"Droid: A large-scale in-the-wild robot manipulation dataset","author":"Khazatsky","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b250","series-title":"ReWiND: Language-guided rewards teach robot policies without new demonstrations","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b251","series-title":"ViSA-Flow: Accelerating robot skill learning via large-scale video semantic action flow","author":"Chen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b252","series-title":"Embodied-r1: Reinforced embodied reasoning for general robotic manipulation","author":"Yuan","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b253","series-title":"2025 IEEE international conference on robotics and automation","article-title":"Effective tuning strategies for generalist robot manipulation policies","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b254","series-title":"Irl-vla: Training an vision-language-action policy via reward world model","author":"Jiang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b255","series-title":"Training strategies for efficient embodied reasoning","author":"Chen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b256","series-title":"Thinkact: Vision-language-action reasoning via reinforced visual latent planning","author":"Huang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b257","series-title":"Cast: Counterfactual labels improve instruction following in vision-language-action models","author":"Glossop","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b258","series-title":"Hybridvla: Collaborative diffusion and autoregression in a unified vision-language-action model","author":"Liu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b259","doi-asserted-by":"crossref","unstructured":"Ji Yuheng, et al. Robobrain: A unified brain model for robotic manipulation from abstract to concrete. In: Proceedings of the computer vision and pattern recognition conference. 2025.","DOI":"10.1109\/CVPR52734.2025.00168"},{"key":"10.1016\/j.array.2026.100739_b260","series-title":"Rationalvla: A rational vision-language-action model with dual system","author":"Song","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b261","series-title":"CEED-VLA: Consistency vision-language-action model with early-exit decoding","author":"Song","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b262","series-title":"Openhelix: A short survey, empirical analysis, and open-source dual-system vla model for robotic manipulation","author":"Cui","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b263","series-title":"2025 IEEE International Conference on Robotics and Automation","article-title":"Revla: Reverting visual domain limitation of robotic foundation models","author":"Dey","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b264","series-title":"Egovla: Learning vision-language-action models from egocentric human videos","author":"Yang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b265","series-title":"SAFE: Multitask failure detection for vision-language-action models","author":"Gu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b266","series-title":"ACTLLM: Action consistency tuned large language model","author":"Bi","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b267","series-title":"Dywa: Dynamics-adaptive world action model for generalizable non-prehensile manipulation","author":"Lyu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b268","series-title":"CrayonRobo: Object-centric prompt-driven vision-language-action model for robotic manipulation","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b269","series-title":"BridgeVLA: Input-output alignment for efficient 3D manipulation learning with vision-language models","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b270","series-title":"cVLA: Towards efficient camera-space VLAs","author":"Argus","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b271","series-title":"Geomanip: Geometric constraints as general interfaces for robot manipulation","author":"Tang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b272","series-title":"Real-time execution of action chunking flow policies","author":"Black","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b273","series-title":"Ttf-vla: Temporal token fusion via pixel-attention integration for vision-language-action models","author":"Liu","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b274","series-title":"Memoryvla: Perceptual-cognitive memory in vision-language-action models for robotic manipulation","author":"Shi","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b275","unstructured":"Zhang Tianyu, et al. VCR: A Task for Pixel-Level Complex Reasoning in Vision Language Models via Restoring Occluded Text. In: The thirteenth international conference on learning representations."},{"key":"10.1016\/j.array.2026.100739_b276","first-page":"131035","article-title":"VLM4Bio: A benchmark dataset to evaluate pretrained vision-language models for trait discovery from biological images","volume":"37","author":"Maruf","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b277","series-title":"A survey on vision-language-action models for embodied ai","author":"Ma","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b278","series-title":"Towards Generalist Robots: Learning Paradigms for Scalable Skill Acquisition@ CoRL2023","article-title":"Open x-embodiment: Robotic learning datasets and rt-x models","author":"Vuong","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b279","series-title":"ERVQA: A dataset to benchmark the readiness of large vision language models in hospital environments","author":"Ray","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b280","doi-asserted-by":"crossref","unstructured":"Lin Kevin Qinghong, et al. Showui: One vision-language-action model for gui visual agent. In: Proceedings of the computer vision and pattern recognition conference. 2025.","DOI":"10.1109\/CVPR52734.2025.01816"},{"key":"10.1016\/j.array.2026.100739_b281","series-title":"Fine-tuning vision-language-action models: Optimizing speed and success","author":"Kim","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b282","series-title":"SkySenseGPT: A fine-grained instruction tuning dataset and model for remote sensing vision-language understanding","author":"Luo","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b283","series-title":"Openvla: An open-source vision-language-action model","author":"Kim","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b284","series-title":"TaskClip: Extend large vision-language model for task oriented object detection","author":"Chen","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b285","article-title":"RS5m and GeoRSCLIP: A large scale vision-language dataset and a large vision-language model for remote sensing","author":"Zhang","year":"2024","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"10.1016\/j.array.2026.100739_b286","doi-asserted-by":"crossref","first-page":"140632","DOI":"10.52202\/079017-4464","article-title":"VHELM: A holistic evaluation of vision language models","volume":"37","author":"Lee","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b287","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103271","article-title":"HumanVLM: Foundation for human-scene vision-language model","author":"Dai","year":"2025","journal-title":"Inf Fusion"},{"key":"10.1016\/j.array.2026.100739_b288","series-title":"Talking to DINO: Bridging self-supervised vision backbones with language for open-vocabulary segmentation","author":"Barsellotti","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b289","series-title":"IllusionVQA: A challenging optical illusion dataset for vision language models","author":"Shahgir","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b290","series-title":"VLLFL: A vision-language model based lightweight federated learning framework for smart agriculture","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b291","doi-asserted-by":"crossref","unstructured":"Chen Jieneng, et al. ViTamin: Designing scalable vision models in the vision-language era. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.01231"},{"key":"10.1016\/j.array.2026.100739_b292","doi-asserted-by":"crossref","unstructured":"Ardakani Arash, et al. Slimfit: Memory-efficient fine-tuning of transformer-based models using training dynamics. In: Proceedings of the 2024 conference of the North American chapter of the association for computational linguistics: human language technologies (volume 1: long papers). 2024.","DOI":"10.18653\/v1\/2024.naacl-long.345"},{"issue":"10","key":"10.1016\/j.array.2026.100739_b293","doi-asserted-by":"crossref","first-page":"852","DOI":"10.1038\/s43588-025-00871-0","article-title":"Vision language models excel at perception but struggles with scientific reasoning","volume":"5","author":"Jablonka","year":"2025","journal-title":"Nat Comput Sci"},{"key":"10.1016\/j.array.2026.100739_b294","series-title":"LVLM-interpret: An interpretability tool for large vision-language models","author":"Stan","year":"2024"},{"issue":"1","key":"10.1016\/j.array.2026.100739_b295","doi-asserted-by":"crossref","first-page":"277","DOI":"10.1038\/s43856-024-00709-2","article-title":"Development of a large-scale medical visual question-answering dataset","volume":"4","author":"Zhang","year":"2024","journal-title":"Commun Med"},{"key":"10.1016\/j.array.2026.100739_b296","series-title":"European conference on computer vision","article-title":"Vitatecs: A diagnostic dataset for temporal concept understanding of video-language models","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b297","series-title":"Proceedings of the AAAI conference on artificial intelligence","article-title":"CVLUE: A new benchmark dataset for Chinese vision-language understanding evaluation","volume":"vol. 39. no. 8","author":"Wang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b298","series-title":"Vlfeedback: A large-scale AI feedback dataset for large vision-language models alignment","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b299","doi-asserted-by":"crossref","first-page":"e32690","DOI":"10.2196\/32690","article-title":"Vision-language model for generating textual descriptions from clinical images: Model development and validation study","volume":"8","author":"Ji","year":"2024","journal-title":"JMIR Form Res"},{"key":"10.1016\/j.array.2026.100739_b300","series-title":"MiniVLM: A smaller and faster vision-language model","author":"Wang","year":"2020"},{"key":"10.1016\/j.array.2026.100739_b301","series-title":"Mini-gemini: Mining the potential of multi-modality vision language models","author":"Li","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b302","series-title":"Vision-language pre-training: Basics, recent advances, and future trends","first-page":"163","author":"Gan","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b303","series-title":"A survey of attacks on large vision-language models: Resources, advances, and future trends","author":"Liu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b304","doi-asserted-by":"crossref","unstructured":"Chen Boyuan, et al. SpatialVLM: Endowing vision-language models with spatial reasoning capabilities. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"10.1016\/j.array.2026.100739_b305","series-title":"Exploring the frontier of vision-language models: A survey of current methodologies and future directions","author":"Ghosh","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b306","series-title":"A survey of vision-language pre-trained models","author":"Du","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b307","article-title":"Vision-language models for vision tasks: A survey","author":"Zhang","year":"2024","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10.1016\/j.array.2026.100739_b308","doi-asserted-by":"crossref","unstructured":"Li Zongxia, et al. A survey of state of the art large vision language models: Benchmark evaluations and challenges. In: Proceedings of the computer vision and pattern recognition conference. 2025, p. 1587\u2013606.","DOI":"10.1109\/CVPRW67362.2025.00147"},{"key":"10.1016\/j.array.2026.100739_b309","series-title":"Vision-language model for object detection and segmentation: A review and evaluation","author":"Feng","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b310","doi-asserted-by":"crossref","unstructured":"Shinde Gaurav, et al. A Survey on Efficient Vision-Language Models. In: Wiley interdisciplinary reviews: Data mining and knowledge discovery. 2025, e70036.","DOI":"10.1002\/widm.70036"},{"key":"10.1016\/j.array.2026.100739_b311","series-title":"Frontiers in cyber security","article-title":"Large vision-language model security: A survey","volume":"vol. 2315","author":"Wang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b312","series-title":"A survey on vision-language-action models for embodied ai","author":"Ma","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b313","doi-asserted-by":"crossref","unstructured":"Chen Zhe, et al. InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"10.1016\/j.array.2026.100739_b314","series-title":"International conference on machine learning","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020"},{"key":"10.1016\/j.array.2026.100739_b315","series-title":"InternLM-XComposer-2.5: A versatile large vision language model supporting long-contextual input and output","author":"Zhang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b316","series-title":"ICLR","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b317","doi-asserted-by":"crossref","DOI":"10.1109\/TWC.2025.3564356","article-title":"Task-driven priority-aware computation offloading using deep reinforcement learning","author":"Hao","year":"2025","journal-title":"IEEE Trans Wirel Commun"},{"key":"10.1016\/j.array.2026.100739_b318","doi-asserted-by":"crossref","DOI":"10.1109\/TC.2025.3604463","article-title":"Reliability-aware optimization of task offloading for uav-assisted edge computing","author":"Hao","year":"2025","journal-title":"IEEE Trans Comput"},{"key":"10.1016\/j.array.2026.100739_b319","series-title":"Explaining and harnessing adversarial examples","author":"Goodfellow","year":"2014"},{"key":"10.1016\/j.array.2026.100739_b320","series-title":"A survey of safety on large vision-language models: Attacks, defenses and evaluations","author":"Ye","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b321","series-title":"International Conference on Frontiers in Cyber Security","article-title":"Large vision-language model security: A survey","author":"Wang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b322","series-title":"International Conference on Artificial Neural Networks","article-title":"Unveiling vulnerabilities in large vision-language models: The SAVJ jailbreak approach","author":"Zhang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b323","series-title":"DeepSeek-VL: Towards real-world vision-language understanding","author":"Lu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b324","article-title":"Lvlm-ehub: A comprehensive evaluation benchmark for large vision-language models","author":"Xu","year":"2024","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10.1016\/j.array.2026.100739_b325","first-page":"33120","article-title":"LOVM: Language-only vision model selection","volume":"36","author":"Zohar","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b326","series-title":"A survey of vision-language pre-trained models","author":"Du","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b327","doi-asserted-by":"crossref","unstructured":"Du Yu, et al. Learning to prompt for open-vocabulary object detection with vision-language model. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"10.1016\/j.array.2026.100739_b328","series-title":"Proceedings of the AAAI conference on artificial intelligence","article-title":"Unifying vision-language representation space with single-tower transformer","volume":"vol. 37. no. 1","author":"Jang","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b329","doi-asserted-by":"crossref","unstructured":"Zhai Xiaohua, et al. Scaling vision transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"10.1016\/j.array.2026.100739_b330","unstructured":"Devlin Jacob, et al. Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers). 2019."},{"key":"10.1016\/j.array.2026.100739_b331","series-title":"Visualbert: A simple and performant baseline for vision and language","author":"Li","year":"2019"},{"key":"10.1016\/j.array.2026.100739_b332","series-title":"LXMERT: Learning cross-modality encoder representations from transformers","author":"Tan","year":"2019"},{"key":"10.1016\/j.array.2026.100739_b333","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"Lu","year":"2019","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b334","series-title":"Measuring compositional generalization: A comprehensive method on realistic data","author":"Keysers","year":"2019"},{"key":"10.1016\/j.array.2026.100739_b335","doi-asserted-by":"crossref","unstructured":"He Kaiming, et al. Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"10.1016\/j.array.2026.100739_b336","doi-asserted-by":"crossref","unstructured":"Xie Zhenda, et al. Simmim: A simple framework for masked image modeling. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"10.1016\/j.array.2026.100739_b337","series-title":"Visual large language models for generalized and specialized applications","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b338","doi-asserted-by":"crossref","unstructured":"Zanella Maxime, Ayed Ismail Ben. On the test-time zero-shot generalization of vision-language models: Do we really need prompt learning?. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.02245"},{"key":"10.1016\/j.array.2026.100739_b339","first-page":"61501","article-title":"VisionLLM: Large language model is also an open-ended decoder for vision-centric tasks","volume":"36","author":"Wang","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b340","unstructured":"Al-Tahan Haider, et al. Scaling vision-language models does not improve relational understanding: The right learning objective helps. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024."},{"key":"10.1016\/j.array.2026.100739_b341","series-title":"Proceedings of the AAAI conference on artificial intelligence","article-title":"Benchmarking and understanding compositional relational reasoning of llms","volume":"vol. 39. no. 18","author":"Ni","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b342","series-title":"Lost in embeddings: Information loss in vision-language models","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b343","doi-asserted-by":"crossref","unstructured":"Aravindan AV, Jha A, Kulkarni M. Do VLMs Have Bad Eyes? Diagnosing Compositional Failures via Mechanistic Interpretability. In: Proceedings of the IEEE\/CVF international conference on computer vision. 2025, p. 704\u201312.","DOI":"10.1109\/ICCVW69036.2025.00079"},{"key":"10.1016\/j.array.2026.100739_b344","series-title":"Mixtral of experts","author":"Jiang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b345","doi-asserted-by":"crossref","first-page":"49205","DOI":"10.52202\/075280-2140","article-title":"Symbolic discovery of optimization algorithms","volume":"36","author":"Chen","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b346","series-title":"Unveiling the compositional ability gap in vision-language reasoning model","author":"Li","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b347","series-title":"Compositional attention: Disentangling search and retrieval","author":"Mittal","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b348","doi-asserted-by":"crossref","unstructured":"Johnson Justin, et al. Clevr: A diagnostic dataset for compositional language and elementary visual reasoning. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2017.","DOI":"10.1109\/CVPR.2017.215"},{"key":"10.1016\/j.array.2026.100739_b349","doi-asserted-by":"crossref","unstructured":"Thrush Tristan, et al. Winoground: Probing vision and language models for visio-linguistic compositionality. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"10.1016\/j.array.2026.100739_b350","doi-asserted-by":"crossref","unstructured":"Agrawal Aishwarya, et al. Don\u2019t just assume; look and answer: Overcoming priors for visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2018.","DOI":"10.1109\/CVPR.2018.00522"},{"key":"10.1016\/j.array.2026.100739_b351","doi-asserted-by":"crossref","unstructured":"Hudson Drew A, Manning Christopher D. Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2019.","DOI":"10.1109\/CVPR.2019.00686"},{"key":"10.1016\/j.array.2026.100739_b352","doi-asserted-by":"crossref","first-page":"31096","DOI":"10.52202\/075280-1355","article-title":"Sugarcrepe: Fixing hackable benchmarks for vision-language compositionality","volume":"36","author":"Hsieh","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b353","series-title":"St-moe: Designing stable and transferable sparse expert models","author":"Zoph","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b354","series-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017"},{"key":"10.1016\/j.array.2026.100739_b355","article-title":"Moe-llava: Mixture of experts for large vision-language models","author":"Lin","year":"2026","journal-title":"IEEE Trans Multimed"},{"key":"10.1016\/j.array.2026.100739_b356","unstructured":"Gu Albert, Dao Tri. Mamba: Linear-time sequence modeling with selective state spaces. In: First conference on language modeling. 2024."},{"key":"10.1016\/j.array.2026.100739_b357","series-title":"Vision mamba: Efficient visual representation learning with bidirectional state space model","author":"Zhu","year":"2024"},{"issue":"1","key":"10.1016\/j.array.2026.100739_b358","doi-asserted-by":"crossref","first-page":"20","DOI":"10.1007\/s11263-025-02597-y","article-title":"Video mamba suite: State space model as a versatile alternative for video understanding","volume":"134","author":"Chen","year":"2026","journal-title":"Int J Comput Vis"},{"key":"10.1016\/j.array.2026.100739_b359","article-title":"Heterogeneous graph learning for visual commonsense reasoning","volume":"32","author":"Yu","year":"2019","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b360","doi-asserted-by":"crossref","first-page":"2473","DOI":"10.1109\/TMM.2021.3082292","article-title":"Graph-based visual-semantic entanglement network for zero-shot image recognition","volume":"24","author":"Hu","year":"2021","journal-title":"IEEE Trans Multimed"},{"key":"10.1016\/j.array.2026.100739_b361","series-title":"Compositional attention networks for interpretability in natural language question answering","author":"Selvakumar","year":"2018"},{"key":"10.1016\/j.array.2026.100739_b362","series-title":"The neuro-symbolic concept learner: Interpreting scenes, words, and sentences from natural supervision","author":"Mao","year":"2019"},{"key":"10.1016\/j.array.2026.100739_b363","first-page":"16705","article-title":"TAISU: A 166M large-scale high-quality dataset for Chinese vision-language pre-training","volume":"35","author":"Liu","year":"2022","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b364","series-title":"European conference on computer vision","article-title":"Slip: Self-supervision meets language-image pre-training","author":"Mu","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b365","article-title":"LVLM-EHub: A comprehensive evaluation benchmark for large vision-language models","author":"Xu","year":"2024","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"10.1016\/j.array.2026.100739_b366","article-title":"RemoteCLIP: A vision language foundation model for remote sensing","author":"Liu","year":"2024","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"10.1016\/j.array.2026.100739_b367","doi-asserted-by":"crossref","first-page":"272","DOI":"10.1016\/j.isprsjprs.2025.03.028","article-title":"RSGPT: A remote sensing vision language model and benchmark","volume":"224","author":"Hu","year":"2025","journal-title":"ISPRS J Photogramm Remote Sens"},{"key":"10.1016\/j.array.2026.100739_b368","series-title":"Learning fine-grained bimanual manipulation with low-cost hardware","author":"Zhao","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b369","series-title":"Octo: An open-source generalist robot policy","author":"Team","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b370","series-title":"Voxposer: Composable 3d value maps for robotic manipulation with language models","author":"Huang","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b371","unstructured":"Li Jinming, et al. Coavla: Improving vision-language-action models via visual-text chain-of-affordance. In: Proceedings of the IEEE\/CVF international conference on computer vision.. 2025."},{"key":"10.1016\/j.array.2026.100739_b372","series-title":"Seer: Language instructed video prediction with latent diffusion models","author":"Gu","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b373","series-title":"Interleave-vla: Enhancing robot manipulation with interleaved image-text instructions","author":"Fan","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b374","doi-asserted-by":"crossref","unstructured":"Yang Ze, et al. Unisim: A neural closed-loop sensor simulator. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2023.","DOI":"10.1109\/CVPR52729.2023.00140"},{"key":"10.1016\/j.array.2026.100739_b375","series-title":"Improving language understanding by generative pre-training","first-page":"3","author":"Radford","year":"2018"},{"key":"10.1016\/j.array.2026.100739_b376","series-title":"Integrating reinforcement learning with foundation models for autonomous robotics: Methods and perspectives","author":"Moroncelli","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b377","series-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b378","series-title":"International conference on machine learning","article-title":"Parameter-efficient transfer learning for NLP","author":"Houlsby","year":"2019"},{"key":"10.1016\/j.array.2026.100739_b379","series-title":"Tip-adapter: Training-free clip-adapter for better vision-language modeling","author":"Zhang","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b380","series-title":"Prefix-tuning: Optimizing continuous prompts for generation","author":"Li","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b381","doi-asserted-by":"crossref","unstructured":"Mercea Otniel-Bogdan, et al. Time-memory-and parameter-efficient visual adaptation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2024.","DOI":"10.1109\/CVPR52733.2024.00529"},{"key":"10.1016\/j.array.2026.100739_b382","article-title":"M2ist: Multi-modal interactive side-tuning for efficient referring expression comprehension","author":"Liu","year":"2025","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"10.1016\/j.array.2026.100739_b383","series-title":"Vl-bert: Pre-training of generic visual-linguistic representations","author":"Su","year":"2019"},{"key":"10.1016\/j.array.2026.100739_b384","series-title":"International conference on machine learning","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b385","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","article-title":"FLAVA: A foundational language and vision alignment model","author":"Singh","year":"2022"},{"key":"10.1016\/j.array.2026.100739_b386","series-title":"Pumer: Pruning and merging tokens for efficient vision language models","author":"Cao","year":"2023"},{"key":"10.1016\/j.array.2026.100739_b387","series-title":"European conference on computer vision","article-title":"Turbo: Informativity-driven acceleration plug-in for vision-language large models","author":"Ju","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b388","first-page":"65252","article-title":"Swapprompt: Test-time prompt adaptation for vision-language models","volume":"36","author":"Ma","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b389","doi-asserted-by":"crossref","unstructured":"Fuchs Cl\u00e9ment, Zanella Maxime, Vleeschouwer Christophe De. Online Gaussian Test-Time Adaptation of Vision-Language Models. In: Proceedings of the Computer Vision and Pattern Recognition Conference. 2025.","DOI":"10.1109\/CVPRW67362.2025.00018"},{"key":"10.1016\/j.array.2026.100739_b390","series-title":"Efficient test-time prompt tuning for vision-language models","author":"Zhu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b391","unstructured":"Kendall Alex, Gal Yarin, Cipolla Roberto. Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2018."},{"key":"10.1016\/j.array.2026.100739_b392","doi-asserted-by":"crossref","unstructured":"He Kaiming, et al. Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2020.","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"10.1016\/j.array.2026.100739_b393","series-title":"Roberta: A robustly optimized bert pretraining approach","author":"Liu","year":"2019"},{"issue":"9","key":"10.1016\/j.array.2026.100739_b394","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","volume":"130","author":"Zhou","year":"2022","journal-title":"Int J Comput Vis"},{"key":"10.1016\/j.array.2026.100739_b395","doi-asserted-by":"crossref","unstructured":"Zhou Kaiyang, et al. Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"10.1016\/j.array.2026.100739_b396","series-title":"VOCO-LLaMA: Towards vision compression with large language models","author":"Ye","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b397","doi-asserted-by":"crossref","unstructured":"Dou Zi-Yi, et al. An empirical study of training end-to-end vision-and-language transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022.","DOI":"10.1109\/CVPR52688.2022.01763"},{"key":"10.1016\/j.array.2026.100739_b398","series-title":"Open-vocabulary object detection via vision and language knowledge distillation","author":"Gu","year":"2021"},{"key":"10.1016\/j.array.2026.100739_b399","series-title":"Vision language models in medicine","author":"Kalp\u00e9lb\u00e9","year":"2025"},{"issue":"2","key":"10.1016\/j.array.2026.100739_b400","doi-asserted-by":"crossref","first-page":"581","DOI":"10.1007\/s11263-023-01891-x","article-title":"CLIP-adapter: Better vision-language models with feature adapters","volume":"132","author":"Gao","year":"2024","journal-title":"Int J Comput Vis"},{"key":"10.1016\/j.array.2026.100739_b401","doi-asserted-by":"crossref","unstructured":"Miech Antoine, et al. HowTo100M: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF international conference on computer vision. 2019.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"10.1016\/j.array.2026.100739_b402","series-title":"Allava: Harnessing GPT4v-synthesized data for lite vision-language models","author":"Chen","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b403","series-title":"VLM-R1: A stable and generalizable R1-style large vision-language model","author":"Shen","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b404","first-page":"26650","article-title":"LAMM: Language-assisted multi-modal instruction-tuning dataset, framework, and benchmark","volume":"36","author":"Yin","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b405","series-title":"A large-scale vision-language dataset derived from open scientific literature to advance biomedical generalist AI","author":"Lozano","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b406","series-title":"SPA-VL: A comprehensive safety preference alignment dataset for vision language model","author":"Zhang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b407","series-title":"VlBiasBench: A comprehensive benchmark for evaluating bias in large vision-language models","author":"Wang","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b408","doi-asserted-by":"crossref","first-page":"72842","DOI":"10.52202\/075280-3185","article-title":"VAST: A vision-audio-subtitle-text omni-modality foundation model and dataset","volume":"36","author":"Chen","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b409","series-title":"YesBut: A high-quality annotated multimodal dataset for evaluating satire comprehension capability of vision-language models","author":"Nandy","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b410","doi-asserted-by":"crossref","first-page":"22107","DOI":"10.52202\/079017-0696","article-title":"SeaFloorAI: A large-scale vision-language dataset for seafloor geological survey","volume":"37","author":"Nguyen","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"10.1016\/j.array.2026.100739_b411","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing","article-title":"SweepMM: A high-quality multimodal dataset for sweeping robots in home scenarios for vision-language model","author":"Xu","year":"2024"},{"key":"10.1016\/j.array.2026.100739_b412","series-title":"GIT: A generative image-to-text transformer for vision and language","author":"Wang","year":"2022"},{"issue":"12","key":"10.1016\/j.array.2026.100739_b413","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1007\/s11432-024-4187-3","article-title":"Mminstruct: A high-quality multi-modal instruction tuning dataset with extensive diversity","volume":"67","author":"Liu","year":"2024","journal-title":"Sci China Inf Sci"},{"key":"10.1016\/j.array.2026.100739_b414","series-title":"Slim: Sim-to-real legged instructive manipulation via long-horizon visuomotor learning","author":"Zhang","year":"2025"},{"key":"10.1016\/j.array.2026.100739_b415","series-title":"The Thirteenth International Conference on Learning Representations","article-title":"Vision language models are in-context value learners","author":"Ma","year":"2024"}],"container-title":["Array"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S2590005626000627?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S2590005626000627?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T09:18:24Z","timestamp":1777367904000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S2590005626000627"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":415,"alternative-id":["S2590005626000627"],"URL":"https:\/\/doi.org\/10.1016\/j.array.2026.100739","relation":{},"ISSN":["2590-0056"],"issn-type":[{"value":"2590-0056","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A systematic review of vision language models: Comprehensive analysis of architectures, applications, datasets and challenges towards robust multimodal intelligence","name":"articletitle","label":"Article Title"},{"value":"Array","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.array.2026.100739","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Author. Published by Elsevier Inc.","name":"copyright","label":"Copyright"}],"article-number":"100739"}}