{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T07:24:24Z","timestamp":1775892264529,"version":"3.50.1"},"reference-count":427,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Japan Science and Technology Agency (JST) and Cutting-edge Research and Development on Information and Communication Sciences","award":["JPMJCS24K6"],"award-info":[{"award-number":["JPMJCS24K6"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/access.2025.3609980","type":"journal-article","created":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T17:38:38Z","timestamp":1757957918000},"page":"162467-162504","source":"Crossref","is-referenced-by-count":13,"title":["Vision-Language-Action Models for Robotics: A Review Towards Real-World Applications"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7464-7187","authenticated-orcid":false,"given":"Kento","family":"Kawaharazuka","sequence":"first","affiliation":[{"name":"Department of Mechano-Informatics, The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2806-9858","authenticated-orcid":false,"given":"Jihoon","family":"Oh","sequence":"additional","affiliation":[{"name":"Department of Mechano-Informatics, The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1612-2064","authenticated-orcid":false,"given":"Jun","family":"Yamada","sequence":"additional","affiliation":[{"name":"Department of Engineering Science, University of Oxford, Oxford, U.K."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6270-700X","authenticated-orcid":false,"given":"Ingmar","family":"Posner","sequence":"additional","affiliation":[{"name":"Department of Mechano-Informatics, The University of Tokyo, Tokyo, Japan"}]},{"given":"Yuke","family":"Zhu","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Texas at Austin, Austin, TX, USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv:2307.09288"},{"key":"ref2","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref3","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Li"},{"key":"ref4","article-title":"GPT-4o system card","author":"Hurst","year":"2024","journal-title":"arXiv:2410.21276"},{"key":"ref5","article-title":"Toward general-purpose robots via foundation models: A survey and meta-analysis","author":"Hu","year":"2023","journal-title":"arXiv:2312.08782"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2024.2408593"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1177\/02783649241281508"},{"key":"ref8","first-page":"287","article-title":"Do as I can, not as I say: Grounding language in robotic affordances","volume-title":"Proc. 6th Conf. Robot Learn. 
(CoRL)","author":"Ahn"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref10","first-page":"2165","article-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","volume-title":"Proc. 7th Conf. Robot Learn. (CoRL)","author":"Brohan"},{"key":"ref11","article-title":"A survey on vision-language-action models for embodied AI","author":"Ma","year":"2024","journal-title":"arXiv:2405.14093"},{"key":"ref12","article-title":"Vision-language-action models: Concepts, progress, applications and challenges","author":"Sapkota","year":"2025","journal-title":"arXiv:2505.04769"},{"key":"ref13","article-title":"A survey on vision-language-action models: An action tokenization perspective","author":"Zhong","year":"2025","journal-title":"arXiv:2507.01925"},{"key":"ref14","article-title":"Microsoft COCO captions: Data collection and evaluation server","author":"Chen","year":"2015","journal-title":"arXiv:1504.00325"},{"key":"ref15","first-page":"894","article-title":"CLIPort: What and where pathways for robotic manipulation","volume-title":"Proc. 5th Conf. Robot Learn. (CoRL)","author":"Shridhar"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.025"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611477"},{"key":"ref18","article-title":"OpenVLA: An open-source vision-language-action model","author":"Kim","year":"2024","journal-title":"arXiv:2406.09246"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.090"},{"key":"ref20","first-page":"1","article-title":"RDT-1B: A diffusion foundation model for bimanual manipulation","volume-title":"Proc. Int. Conf. Learn. Representat. (ICLR)","author":"Liu"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.010"},{"key":"ref22","first-page":"1","article-title":"Latent action pretraining from videos","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Ye"},{"key":"ref23","article-title":"\u03c00.5: A vision-language-action model with open-world generalization","author":"Intelligence","year":"2025","journal-title":"arXiv:2504.16054"},{"key":"ref24","article-title":"GR00T N1: An open foundation model for generalist humanoid robots","author":"Bjorck","year":"2025","journal-title":"arXiv:2503.14734"},{"key":"ref25","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn. (ICML)","author":"Radford"},{"key":"ref26","first-page":"726","article-title":"Transporter networks: Rearranging the visual world for robotic manipulation","volume-title":"Proc. Conf. Robot Learn. (CoRL)","author":"Zeng"},{"key":"ref27","first-page":"28091","article-title":"A generalist agent","volume-title":"Proc. Trans. Mach. Learn. Res.","author":"Reed"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref29","first-page":"66","article-title":"SentencePiece: A simple and language independent subword tokenizer and detokenizer for neural text processing","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process., Syst. Demonstrations","author":"Kudo"},{"key":"ref30","first-page":"1","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Represent. 
(ICLR)","author":"Dosovitskiy"},{"key":"ref31","first-page":"14975","article-title":"VIMA: Robot manipulation with multimodal prompts","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","volume":"202","author":"Jiang"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"issue":"140","key":"ref33","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2019","journal-title":"J. Mach. Learn. Res."},{"key":"ref34","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","volume-title":"Proc. 36th Int. Conf. Mach. Learn. (ICML)","author":"Tan"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"ref36","article-title":"Universal sentence encoder","author":"Cer","year":"2018","journal-title":"arXiv:1803.11175"},{"key":"ref37","first-page":"12786","article-title":"TokenLearner: Adaptive space-time tokenization for videos","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"34","author":"Ryoo"},{"key":"ref38","first-page":"8469","article-title":"PaLM-E: An embodied multimodal language model","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Driess"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01368"},{"key":"ref40","first-page":"70","article-title":"RT-sketch: Goal-conditioned imitation learning from hand-drawn sketches","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","author":"Sundaresan"},{"key":"ref41","first-page":"1","article-title":"RT-trajectory: Robotic task generalization via hindsight trajectory sketches","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Gu"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.049"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611597"},{"key":"ref44","article-title":"AutoRT: Embodied foundation models for large scale orchestration of robotic agents","author":"Ahn","year":"2024","journal-title":"arXiv:2401.12963"},{"key":"ref45","first-page":"23123","article-title":"Prismatic VLMs: Investigating the design space of visually-conditioned language models","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","volume":"235","author":"Karamcheti"},{"key":"ref46","article-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2023","journal-title":"arXiv:2304.07193"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.026"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref50","first-page":"1","article-title":"Transfusion: Predict the next token and diffuse images with one multi-modal model","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zhou"},{"key":"ref51","article-title":"PaliGemma: A versatile 3B VLM for transfer","author":"Beyer","year":"2024","journal-title":"arXiv:2407.07726"},{"key":"ref52","article-title":"Flow matching for generative modeling","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Lipman"},{"key":"ref53","first-page":"1","article-title":"Neural discrete representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 
(NeurIPS)","author":"van den Oord"},{"key":"ref54","article-title":"World model on million-length video and language with ringattention","author":"Liu","year":"2024","journal-title":"arXiv:2402.08268"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.012"},{"key":"ref56","article-title":"VIMA: General robot manipulation with multimodal prompts","author":"Jiang","year":"2022","journal-title":"arXiv:2210.03094"},{"key":"ref57","first-page":"3397","article-title":"Open-world object manipulation using pre-trained vision-language models","volume-title":"Proc. 7th Conf. Robot Learn. (CoRL)","author":"Stone"},{"key":"ref58","article-title":"Rt-sketch: Goal-conditioned imitation learning from hand-drawn sketches","author":"Sundaresan","year":"2024","journal-title":"arXiv:2403.02709"},{"key":"ref59","article-title":"Rt-trajectory: Robotic task generalization via hindsight trajectory sketches","author":"Gu","year":"2023","journal-title":"arXiv:2311.01977"},{"key":"ref60","first-page":"1","article-title":"RoboCat: A self-improving generalist agent for robotic manipulation","volume-title":"Proc. Trans. Mach. Learn. Res.","author":"Bousmalis"},{"key":"ref61","article-title":"Vision-language foundation models as effective robot imitators","author":"Li","year":"2023","journal-title":"arXiv:2311.01378"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.025"},{"key":"ref64","first-page":"141208","article-title":"BAKU: An efficient transformer for multi-task policy learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Haldar"},{"key":"ref65","article-title":"Actra: Optimized transformer architecture for vision-language-action models in robot learning","author":"Ma","year":"2024","journal-title":"arXiv:2408.01147"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10802706"},{"key":"ref67","first-page":"496","article-title":"Scaling cross-embodied learning: One policy for manipulation, navigation, locomotion and aviation","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","volume":"270","author":"Doshi"},{"key":"ref68","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 
(NeurIPS)","author":"Ho"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610665"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3544909"},{"key":"ref71","article-title":"RoboBERT: An end-to-end multimodal robotic manipulation model","author":"Wang","year":"2025","journal-title":"arXiv:2502.07837"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02576"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.031"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.121"},{"key":"ref75","article-title":"DexGraspVLA: A vision-language-action framework towards general dexterous grasping","author":"Zhong","year":"2025","journal-title":"arXiv:2502.20900"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.074"},{"key":"ref77","article-title":"FP3: A 3D foundation policy for robotic manipulation","author":"Yang","year":"2025","journal-title":"arXiv:2503.08950"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02102"},{"key":"ref79","article-title":"Gripper keypose and object pointflow as interfaces for bimanual robotic manipulation","author":"Yang","year":"2025","journal-title":"arXiv:2504.17784"},{"key":"ref80","article-title":"Diffusion transformer policy","author":"Hou","year":"2024","journal-title":"arXiv:2410.15959"},{"key":"ref81","first-page":"20413","article-title":"An embodied generalist agent in 3D world","volume-title":"Proc. 41st Int. Conf. Mach. Learn. (ICML)","author":"Huang"},{"key":"ref82","article-title":"Unleashing large-scale video generative pre-training for visual robot manipulation","author":"Wu","year":"2023","journal-title":"arXiv:2312.13139"},{"key":"ref83","first-page":"40085","article-title":"Robomamba: Efficient vision-language-action model for robotic reasoning and manipulation","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"37","author":"Liu"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72652-1_21"},{"key":"ref85","first-page":"20","article-title":"LLaRA: Supercharging robot learning data for vision-language policy","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Li"},{"key":"ref86","first-page":"3157","article-title":"Robotic control via embodied chain-of-thought reasoning","volume-title":"Proc. 8th Conf. Robot Learn. 
(CoRL)","author":"Zawalski"},{"key":"ref87","article-title":"3D-VLA: A 3D vision-language-action generative world model","author":"Zhen","year":"2024","journal-title":"arXiv:2403.09631"},{"key":"ref88","article-title":"RoboUniView: Visual-language model with unified view representation for robotic manipulation","author":"Liu","year":"2024","journal-title":"arXiv:2406.18977"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00195"},{"key":"ref90","article-title":"Diffusion-VLA: Generalizable and interpretable robot foundation model via self-generated reasoning","author":"Wen","year":"2024","journal-title":"arXiv:2412.03293"},{"key":"ref91","article-title":"DexVLA: Vision-language model with plug-in diffusion expert for general robot control","author":"Wen","year":"2025","journal-title":"arXiv:2502.05855"},{"key":"ref92","article-title":"ChatVLA: Unified multimodal understanding and robot control with vision-language-action model","author":"Zhou","year":"2025","journal-title":"arXiv:2502.14420"},{"key":"ref93","article-title":"ObjectVLA: End-to-end open-world object manipulation without demonstration","author":"Zhu","year":"2025","journal-title":"arXiv:2502.19250"},{"key":"ref94","article-title":"AgiBot world colosseo: A large-scale manipulation platform for scalable and intelligent embodied systems","author":"Bu","year":"2025","journal-title":"arXiv:2503.06669"},{"key":"ref95","article-title":"PointVLA: Injecting the 3D world into vision-language-action models","author":"Li","year":"2025","journal-title":"arXiv:2503.07511"},{"key":"ref96","article-title":"MoLe-VLA: Dynamic layer-skipping vision language action model via mixture-of-layers for efficient robot manipulation","author":"Zhang","year":"2025","journal-title":"arXiv:2503.20384"},{"key":"ref97","article-title":"Fast-in-slow: A dual-system foundation model unifying fast manipulation within slow reasoning","author":"Chen","year":"2025","journal-title":"arXiv:2506.01953"},{"key":"ref98","article-title":"CronusVLA: Transferring latent motion across time for multi-frame prediction in manipulation","author":"Li","year":"2025","journal-title":"arXiv:2506.19816"},{"key":"ref99","article-title":"HybridVLA: Collaborative diffusion and autoregression in a unified vision-language-action model","author":"Liu","year":"2025","journal-title":"arXiv:2503.10631"},{"key":"ref100","article-title":"GraspVLA: A grasping foundation model pre-trained on billion-scale synthetic action data","author":"Deng","year":"2025","journal-title":"arXiv:2505.03233"},{"key":"ref101","article-title":"OneTwoVLA: A unified vision-language-action model with adaptive reasoning","author":"Lin","year":"2025","journal-title":"arXiv:2505.11917"},{"key":"ref102","article-title":"Hume: Introducing system-2 thinking in visual-language-action model","author":"Song","year":"2025","journal-title":"arXiv:2505.21432"},{"key":"ref103","article-title":"SwitchVLA: Execution-aware task switching for vision-language-action models","author":"Li","year":"2025","journal-title":"arXiv:2506.03574"},{"key":"ref104","article-title":"CogACT: A foundational vision-language-action model for synergizing cognition and action in robotic manipulation","author":"Li","year":"2024","journal-title":"arXiv:2411.19650"},{"key":"ref105","article-title":"TrackVLA: Embodied visual tracking in the wild","author":"Wang","year":"2025","journal-title":"arXiv:2505.23189"},{"key":"ref106","article-title":"SmolVLA: A vision-language-action model for affordable and efficient 
robotics","author":"Shukor","year":"2025","journal-title":"arXiv:2506.01844"},{"key":"ref107","article-title":"Mind: Unified visual imagination and control via hierarchical world models","author":"Chi","year":"2025","journal-title":"arXiv:2506.18897"},{"key":"ref108","first-page":"9156","article-title":"Learning universal policies via text-guided video generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Du"},{"key":"ref109","first-page":"8633","article-title":"Video diffusion models","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Ho"},{"key":"ref110","article-title":"DreamGen: Unlocking generalization in robot learning through video world models","author":"Jang","year":"2025","journal-title":"arXiv:2505.12705"},{"key":"ref111","first-page":"1","article-title":"Gevrm: Goal-expressive video generation model for robust visual manipulation","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zhang"},{"key":"ref112","first-page":"22304","article-title":"Compositional foundation models for hierarchical planning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Ajay"},{"key":"ref113","first-page":"3943","article-title":"Dreamitate: Real-world visuomotor policy learning via video generation","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","author":"Liang"},{"key":"ref114","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023","journal-title":"arXiv:2311.15127"},{"key":"ref115","first-page":"715","article-title":"Megapose: 6d pose estimation of novel objects via render & compare","volume-title":"Proc. 6th Conf. Robot Learn. (CoRL)","volume":"205","author":"Labb\u00e9"},{"key":"ref116","first-page":"1","article-title":"Zero-shot robotic manipulation with pre-trained image-editing diffusion models","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Black"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127988"},{"key":"ref119","first-page":"1","article-title":"Learning to act from actionless videos through dense correspondences","volume-title":"Proc. Int. Conf. Learn. Represent. 
(ICLR)","author":"Ko"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00795"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.092"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73033-7_2"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73116-7_18"},{"key":"ref124","article-title":"Pixel motion as universal representation for robot control","author":"Ranasinghe","year":"2025","journal-title":"arXiv:2505.07817"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"ref126","article-title":"Moto: Latent motion token as the bridging language for learning robot manipulation from videos","author":"Chen","year":"2024","journal-title":"arXiv:2412.04445"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.014"},{"key":"ref128","article-title":"UniSkill: Imitating human videos via cross-embodiment skill representations","author":"Kim","year":"2025","journal-title":"arXiv:2505.08787"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref131","article-title":"GR-2: A generative video-language-action model with web-scale knowledge for robot manipulation","author":"Cheang","year":"2024","journal-title":"arXiv:2410.06158"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"ref133","first-page":"3581","article-title":"Semi-supervised learning with deep generative models","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"27","author":"Kingma"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2025.3526436"},{"key":"ref135","article-title":"GR-3 technical report","volume-title":"arXiv:2507.15493","author":"Cheang","year":"2025"},{"key":"ref136","article-title":"Qwen2.5-VL technical report","volume-title":"arXiv:2502.13923","author":"Bai","year":"2025"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref138","article-title":"Point-E: A system for generating 3D point clouds from complex prompts","author":"Nichol","year":"2022","journal-title":"arXiv:2212.08751"},{"key":"ref139","article-title":"FLARE: Robot learning with implicit world modeling","author":"Zheng","year":"2025","journal-title":"arXiv:2505.15659"},{"key":"ref140","article-title":"WorldVLA: Towards autoregressive action world model","author":"Cen","year":"2025","journal-title":"arXiv:2506.21539"},{"key":"ref141","article-title":"ViSA-flow: Accelerating robot skill learning via large-scale video semantic action flow","author":"Chen","year":"2025","journal-title":"arXiv:2505.01288"},{"key":"ref142","first-page":"67","article-title":"The theory of affordances","volume-title":"Perceiving, Acting, and Knowing: Toward an Ecological Psychology","author":"Gibson","year":"1977"},{"key":"ref143","first-page":"540","article-title":"VoxPoser: Composable 3D value maps for robotic manipulation with language models","volume-title":"Proc. 7th Conf. Robot Learn. 
(CoRL)","author":"Huang"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20080-9_42"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref146","article-title":"Affordance-guided reinforcement learning via visual prompting","author":"Lee","year":"2024","journal-title":"arXiv:2407.10341"},{"key":"ref147","first-page":"178","article-title":"Language embedded radiance fields for zero-shot task-oriented grasping","volume-title":"Proc. 7th Conf. Robot Learn. (CoRL)","author":"Rashid"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_24"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01807"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01146"},{"key":"ref152","first-page":"4748","article-title":"Splat-mover: Multi-stage, open-vocabulary robotic manipulation via editable Gaussian splatting","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","volume":"270","author":"Shorinwa"},{"issue":"4","key":"ref153","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3592433","article-title":"3D Gaussian splatting for real-time radiance field rendering","volume":"42","author":"Kerbl","year":"2023","journal-title":"ACM Trans. Graph."},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01324"},{"key":"ref155","first-page":"753","article-title":"Scaling egocentric vision: The dataset","volume-title":"Proc. ECCV","author":"Damen"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01531-2"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00989"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.068"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02576"},{"key":"ref160","first-page":"4005","article-title":"Robopoint: A vision-language model for spatial affordance prediction in robotics","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","volume":"270","author":"Yuan"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02099"},{"key":"ref162","article-title":"RT-affordance: Affordances are versatile intermediate representations for robot manipulation","author":"Nasiriany","year":"2024","journal-title":"arXiv:2411.02704"},{"key":"ref163","article-title":"A0: An affordance-aware hierarchical model for general robotic manipulation","author":"Xu","year":"2025","journal-title":"arXiv:2504.12636"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00168"},{"key":"ref165","article-title":"CoA-VLA: Improving vision-language-action models via visual-textual chain-of-affordance","author":"Li","year":"2024","journal-title":"arXiv:2412.20451"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref169","article-title":"LAION-400M: Open dataset of CLIP-filtered 400 million image-text pairs","volume-title":"Proc. Neurips Data-Centric AI Workshop","author":"Schuhmann"},{"key":"ref170","first-page":"25278","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 
(NeurIPS)","author":"Schuhmann"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"ref172","article-title":"EVA-CLIP: Improved training techniques for CLIP at scale","author":"Sun","year":"2023","journal-title":"arXiv:2303.15389"},{"key":"ref173","first-page":"23716","article-title":"Flamingo: A visual language model for few-shot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Alayrac"},{"key":"ref174","article-title":"ORION: A holistic end-to-end autonomous driving framework by vision-language instructed action generation","author":"Fu","year":"2025","journal-title":"arXiv:2503.19755"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00304"},{"key":"ref178","first-page":"3982","article-title":"Sentence-BERT: Sentence embeddings using Siamese BERT-networks","volume-title":"Proc. Conf. Empirical Methods Natural Lang. Process. 9th Int. Joint Conf. Natural Lang. Process. (EMNLP-IJCNLP)","author":"Reimers"},{"key":"ref179","article-title":"DistilBERT, a distilled version of BERT: Smaller, faster, cheaper and lighter","author":"Sanh","year":"2019","journal-title":"arXiv:1910.01108"},{"key":"ref180","volume-title":"Vicuna: An Open-Source Chatbot Impressing GPT-4 With 90% ChatGPT Quality","author":"Chiang","year":"2023"},{"key":"ref181","article-title":"Gemma: Open models based on Gemini research and technology","author":"Team","year":"2024","journal-title":"arXiv:2403.08295"},{"key":"ref182","article-title":"Qwen2 technical report","volume-title":"arXiv:2407.10671","author":"Yang","year":"2024"},{"issue":"3","key":"ref183","first-page":"1","article-title":"Phi-2: The surprising power of small language models","volume":"1","author":"Javaheripi","year":"2023","journal-title":"Microsoft Res. Blog"},{"key":"ref184","article-title":"SmolLM2: When smol goes big\u2014Data-centric training of a small language model","author":"Allal","year":"2025","journal-title":"arXiv:2502.02737"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.bigscience-1.9"},{"key":"ref186","first-page":"2397","article-title":"Pythia: A suite for analyzing large language models across training and scaling","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Biderman"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00166"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.017"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref190","first-page":"1","article-title":"Denoising diffusion implicit models","volume-title":"Proc. Int. Conf. Learn. Represent. 
(ICLR)","author":"Song"},{"key":"ref191","article-title":"Time-unified diffusion policy with action discrimination for robotic manipulation","author":"Niu","year":"2025","journal-title":"arXiv:2506.09422"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.011"},{"key":"ref193","article-title":"ForceVLA: Enhancing VLA models with a force-aware MoE for contact-rich manipulation","author":"Yu","year":"2025","journal-title":"arXiv:2505.22159"},{"key":"ref194","article-title":"IManip: Skill-incremental learning for robotic manipulation","author":"Zheng","year":"2025","journal-title":"arXiv:2503.07087"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.093"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02096"},{"key":"ref197","article-title":"SOLAMI: Social vision-language-action modeling for immersive interaction with 3D autonomous characters","author":"Jiang","year":"2024","journal-title":"arXiv:2412.00174"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127987"},{"key":"ref199","first-page":"1","article-title":"Vlas: Vision-language-action model with speech instructions for customized robot manipulation","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zhao"},{"key":"ref200","article-title":"MultiGen: Using multimodal generation in simulation to learn multimodal policies in real","author":"Wang","year":"2025","journal-title":"arXiv:2507.02864"},{"key":"ref201","first-page":"1","article-title":"Speechtokenizer: Unified speech tokenizer for speech language models","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zhang"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-698"},{"key":"ref203","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Radford"},{"key":"ref204","article-title":"SoundStorm: Efficient parallel audio generation","author":"Borsos","year":"2023","journal-title":"arXiv:2305.09636"},{"key":"ref205","article-title":"RoboNurse-VLA: Robotic scrub nurse system based on vision-language-action model","author":"Li","year":"2024","journal-title":"arXiv:2409.19590"},{"key":"ref206","article-title":"TLA: Tactile-language-action model for contact-rich manipulation","author":"Hao","year":"2025","journal-title":"arXiv:2503.08548"},{"key":"ref207","article-title":"VTLA: Vision-tactile-language-action model with preference learning for insertion manipulation","author":"Zhang","year":"2025","journal-title":"arXiv:2505.09577"},{"key":"ref208","article-title":"Tactile-VLA: Unlocking vision-language-action model\u2019s physical knowledge for tactile generalization","author":"Huang","year":"2025","journal-title":"arXiv:2507.09160"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2977257"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/TIE.2023.3312418"},{"key":"ref211","first-page":"14080","article-title":"A touch, vision, and language dataset for multimodal alignment","volume-title":"Proc. 41st Int. Conf. Mach. Learn. 
(ICML)","author":"Fu"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"ref213","article-title":"ZoeDepth: Zero-shot transfer by combining relative and metric depth","author":"Bhat","year":"2023","journal-title":"arXiv:2302.12288"},{"key":"ref214","article-title":"HAMSTER: Hierarchical action models for open-world robot manipulation","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Li"},{"key":"ref215","article-title":"RationalVLA: A rational vision-language-action model with dual system","author":"Song","year":"2025","journal-title":"arXiv:2506.10826"},{"key":"ref216","article-title":"OpenHelix: A short survey, empirical analysis, and open-source dual-system VLA model for robotic manipulation","author":"Cui","year":"2025","journal-title":"arXiv:2505.03912"},{"key":"ref217","first-page":"1949","article-title":"3D diffuser actor: Policy diffusion with 3D scene representations","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","author":"Ke"},{"key":"ref218","article-title":"Evo-0: Vision-language-action model with implicit spatial understanding","author":"Lin","year":"2025","journal-title":"arXiv:2507.00416"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00499"},{"key":"ref220","article-title":"RoboMM: All-in-one multimodal large model for robotic manipulation","author":"Yan","year":"2024","journal-title":"arXiv:2412.07215"},{"key":"ref221","article-title":"SAM2Act: Integrating visual foundation model with a memory architecture for robotic manipulation","author":"Fang","year":"2025","journal-title":"arXiv:2501.18564"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.055"},{"key":"ref223","article-title":"OG-VLA: 3D-aware vision language action model via orthographic image generation","author":"Singh","year":"2025","journal-title":"arXiv:2506.01196"},{"key":"ref224","article-title":"BridgeVLA: Input\u2013output alignment for efficient 3D manipulation learning with vision-language models","author":"Li","year":"2025","journal-title":"arXiv:2506.07961"},{"key":"ref225","article-title":"OccLLaMA: An occupancy-language-action generative world model for autonomous driving","author":"Wei","year":"2024","journal-title":"arXiv:2409.03272"},{"key":"ref226","article-title":"OpenDriveVLA: Towards end-to-end autonomous driving with large vision language action model","author":"Zhou","year":"2025","journal-title":"arXiv:2503.23463"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_31"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.16"},{"key":"ref229","first-page":"1","article-title":"PointNet++: Deep hierarchical feature learning on point sets in a metric space","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Qi"},{"key":"ref230","first-page":"23192","article-title":"PointNeXt: Revisiting PointNet++ with improved training and scaling strategies","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Qian"},{"key":"ref231","article-title":"Uni3D: Exploring unified 3D representation at scale","volume-title":"Proc. Int. Conf. Learn. Represent. 
(ICLR)","author":"Zhou"},{"key":"ref232","article-title":"SoFar: Language-grounded orientation bridges spatial reasoning and object manipulation","author":"Qi","year":"2025","journal-title":"arXiv:2502.13143"},{"key":"ref233","article-title":"Integrating LMM planners and 3D skill policies for generalizable manipulation","author":"Li","year":"2025","journal-title":"arXiv:2501.18733"},{"key":"ref234","first-page":"1541","article-title":"General flow as foundation affordance for scalable robot learning","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","author":"Yuan"},{"key":"ref235","article-title":"DexTOG: Learning task-oriented dexterous grasp with language","author":"Zhang","year":"2025","journal-title":"arXiv:2504.04573"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-021-0229-5"},{"key":"ref237","first-page":"1","article-title":"PointCNN: Convolution on x-transformed points","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"31","author":"Li"},{"key":"ref238","article-title":"Pre-training auto-regressive robotic models with 4D representations","author":"Niu","year":"2025","journal-title":"arXiv:2502.13142"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"ref240","first-page":"20067","article-title":"MotionGPT: Human motion as a foreign language","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Jiang"},{"key":"ref241","article-title":"An atomic skill library construction method for data-efficient embodied manipulation","author":"Li","year":"2025","journal-title":"arXiv:2501.15068"},{"key":"ref242","article-title":"Hi robot: Open-ended instruction following with hierarchical vision-language-action models","author":"Xiaoyang Shi","year":"2025","journal-title":"arXiv:2502.19417"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.018"},{"key":"ref244","article-title":"Humanoid-VLA: Towards universal humanoid control with visual integration","author":"Ding","year":"2025","journal-title":"arXiv:2502.14795"},{"key":"ref245","article-title":"LoHoVLA: A unified vision-language-action model for long-horizon embodied tasks","author":"Yang","year":"2025","journal-title":"arXiv:2506.00411"},{"key":"ref246","article-title":"A dual process VLA: Efficient robotic manipulation leveraging VLM","author":"Han","year":"2024","journal-title":"arXiv:2410.15549"},{"key":"ref247","article-title":"TriVLA: A triple-system-based unified vision-language-action model for general robot control","author":"Liu","year":"2025","journal-title":"arXiv:2507.01424"},{"key":"ref248","article-title":"Training strategies for efficient embodied reasoning","author":"Chen","year":"2025","journal-title":"arXiv:2505.08243"},{"key":"ref249","article-title":"Fast ECoT: Efficient embodied chain-of-thought via thoughts reuse","author":"Duan","year":"2025","journal-title":"arXiv:2506.07639"},{"key":"ref250","article-title":"In-context imitation learning via next-token prediction","author":"Fu","year":"2024","journal-title":"arXiv:2408.15980"},{"key":"ref251","article-title":"Temporal representation alignment: Successor features enable emergent compositionality in robot instruction following","author":"Myers","year":"2025","journal-title":"arXiv:2502.05454"},{"key":"ref252","article-title":"Proximal policy optimization algorithms","author":"Schulman","year":"2017","journal-title":"arXiv:1707.06347"},{"key":"ref253","first-page":"1861","article-title":"Soft actor-critic: Off-policy maximum entropy 
deep reinforcement learning with a stochastic actor","volume-title":"Proc. 35th Int. Conf. Mach. Learn. (ICML)","author":"Haarnoja"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127299"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.019"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610040"},{"key":"ref257","article-title":"Precise and dexterous robotic manipulation via human-in-the-loop reinforcement learning","author":"Luo","year":"2024","journal-title":"arXiv:2410.21845"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2015.7354297"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561384"},{"key":"ref260","first-page":"1577","article-title":"Efficient online reinforcement learning with offline data","volume-title":"Proc. 40th Int. Conf. Mach. Learn. (ICML)","author":"Ball"},{"key":"ref261","article-title":"VLA-RL: Towards masterful and general robotic manipulation with scalable reinforcement learning","author":"Lu","year":"2025","journal-title":"arXiv:2505.18719"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.028"},{"key":"ref263","article-title":"RLRC: Reinforcement learning-based recovery for compressed vision-language-action models","author":"Chen","year":"2025","journal-title":"arXiv:2506.17639"},{"key":"ref264","article-title":"Steering your diffusion policy with latent space reinforcement learning","author":"Wagenmaker","year":"2025","journal-title":"arXiv:2506.15799"},{"key":"ref265","article-title":"SLIM: Sim-to-real legged instructive manipulation via long-horizon visuomotor learning","author":"Zhang","year":"2025","journal-title":"arXiv:2501.09905"},{"key":"ref266","article-title":"Refined policy distillation: From VLA generalists to RL experts","author":"J\u00fclg","year":"2025","journal-title":"arXiv:2503.05833"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref269","article-title":"Cosmos world foundation model platform for physical AI","author":"Agarwal","year":"2025","journal-title":"arXiv:2501.03575"},{"key":"ref270","article-title":"Qwen2.5 technical report","volume-title":"arXiv:2412.15115","author":"Yang","year":"2024"},{"key":"ref271","article-title":"NORA: A small open-sourced generalist vision language action model for embodied tasks","author":"Hung","year":"2025","journal-title":"arXiv:2504.19854"},{"key":"ref272","article-title":"Interleave-VLA: Enhancing robot manipulation with interleaved image-text instructions","author":"Fan","year":"2025","journal-title":"arXiv:2505.02152"},{"key":"ref273","first-page":"1","article-title":"CombatVLA: An efficient vision-language-action model for combat tasks in 3D action role-playing games","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis. (ICCV)","author":"Chen"},{"key":"ref274","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 
(NeurIPS)","author":"Liu"},{"key":"ref275","article-title":"Unveiling the potential of vision-language-action models with open-ended multimodal instructions","author":"Zhao","year":"2025","journal-title":"arXiv:2505.11214"},{"key":"ref276","volume-title":"Introducing Gemini 2.0: Our New AI Model for the Agentic Era","author":"DeepMind","year":"2024"},{"key":"ref277","article-title":"Gemini robotics: Bringing AI into the physical world","author":"Team","year":"2025","journal-title":"arXiv:2503.20020"},{"key":"ref278","volume-title":"Introducing Our Multimodal Models","author":"Bavishi","year":"2023"},{"key":"ref279","article-title":"QUAR-VLA: Vision-language-action model for quadruped robots","author":"Ding","year":"2023","journal-title":"arXiv:2312.14457"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11128601"},{"key":"ref281","first-page":"56619","article-title":"Deer-VLA: Dynamic inference of multimodal large language models for efficient robot execution","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","volume":"37","author":"Yue"},{"key":"ref282","article-title":"The llama 3 herd of models","author":"Grattafiori","year":"2024","journal-title":"arXiv:2407.21783"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.076"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.521"},{"key":"ref285","article-title":"Tracevla: Visual trace prompting enhances spatial\u2013temporal awareness for generalist robotic policies","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zheng"},{"key":"ref286","article-title":"UP-VLA: A unified understanding and prediction model for embodied agent","author":"Zhang","year":"2025","journal-title":"arXiv:2501.18867"},{"key":"ref287","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00018"},{"key":"ref288","doi-asserted-by":"publisher","DOI":"10.1109\/HRI61500.2025.10974117"},{"key":"ref289","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref290","article-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024","journal-title":"arXiv:2412.05271"},{"key":"ref291","article-title":"Eagle 2: Building post-training data strategies from scratch for frontier vision-language models","author":"Li","year":"2025","journal-title":"arXiv:2501.14818"},{"key":"ref292","article-title":"Chameleon: Mixed-modal early-fusion foundation models","author":"Team","year":"2024","journal-title":"arXiv:2405.09818"},{"key":"ref293","article-title":"Knowledge insulating vision-language-action models: Train fast, run fast, generalize better","author":"Driess","year":"2025","journal-title":"arXiv:2505.23705"},{"key":"ref294","article-title":"ReVLA: Reverting visual domain limitation of robotic foundation models","author":"Dey","year":"2024","journal-title":"arXiv:2409.15250"},{"key":"ref295","first-page":"145","article-title":"Remix: Optimizing data mixtures for large scale imitation learning","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","volume":"270","author":"Hejna"},{"key":"ref296","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proc. Int. Conf. Learn. Represent. 
(ICLR)","author":"Hu"},{"key":"ref297","article-title":"BitVLA: 1-bit vision-language-action models for robotics manipulation","author":"Wang","year":"2025","journal-title":"arXiv:2506.07530"},{"key":"ref298","article-title":"Real-time execution of action chunking flow policies","author":"Black","year":"2025","journal-title":"arXiv:2506.07339"},{"key":"ref299","article-title":"Deer-VLA: Dynamic inference of multimodal large language models for efficient robot execution","author":"Yue","year":"2024","journal-title":"arXiv:2411.02359"},{"key":"ref300","article-title":"VLA-cache: Towards efficient vision-language-action model via adaptive token caching in robotic manipulation","author":"Xu","year":"2025","journal-title":"arXiv:2502.02175"},{"key":"ref301","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.016"},{"key":"ref302","first-page":"4066","article-title":"Mobile aloha: Learning bimanual mobile manipulation using low-cost whole-body teleoperation","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","volume":"270","author":"Fu"},{"key":"ref303","article-title":"ALOHA 2: An enhanced low-cost hardware for bimanual teleoperation","author":"Aldaco","year":"2024","journal-title":"arXiv:2405.02292"},{"key":"ref304","first-page":"1910","article-title":"ALOHA unleashed: A simple recipe for robot dexterity","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","author":"Zhao"},{"key":"ref305","doi-asserted-by":"publisher","DOI":"10.1109\/AIM55361.2024.10637173"},{"key":"ref306","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801581"},{"key":"ref307","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.015"},{"key":"ref308","article-title":"MediaPipe hands: On-device real-time hand tracking","author":"Zhang","year":"2020","journal-title":"arXiv:2006.10214"},{"key":"ref309","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160765"},{"key":"ref310","first-page":"4895","article-title":"Ace: A cross-platform and visual-exoskeletons system for low-cost dexterous teleoperation","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","volume":"270","author":"Yang"},{"key":"ref311","first-page":"2729","article-title":"Open-TeleVision: Teleoperation with immersive active visual feedback","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","author":"Cheng"},{"key":"ref312","article-title":"Bunny-VisionPro: Real-time bimanual dexterous teleoperation for imitation learning","author":"Ding","year":"2024","journal-title":"arXiv:2407.03162"},{"key":"ref313","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.045"},{"key":"ref314","article-title":"A careful examination of large behavior models for multitask dexterous manipulation","author":"Barreiros","year":"2025","journal-title":"arXiv:2507.05331"},{"key":"ref315","article-title":"Dexumi: Using human hand as the universal manipulation interface for dexterous manipulation","volume-title":"Proc. 3rd RSS Workshop Dexterous Manipulation, Learn. 
Control Diverse Data","author":"Xu"},{"key":"ref316","article-title":"On bringing robots home","author":"Shafiullah","year":"2023","journal-title":"arXiv:2311.16098"},{"key":"ref317","article-title":"Robot utility models: General policies for zero-shot deployment in new environments","author":"Etukuru","year":"2024","journal-title":"arXiv:2409.05865"},{"key":"ref318","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.043"},{"key":"ref319","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2025.XXI.075"},{"key":"ref320","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01834"},{"key":"ref321","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00662"},{"key":"ref322","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02226"},{"key":"ref323","article-title":"Aria everyday activities dataset","author":"Lv","year":"2024","journal-title":"arXiv:2402.13349"},{"key":"ref324","article-title":"EgoMimic: Scaling imitation learning via egocentric video","author":"Kareer","year":"2024","journal-title":"arXiv:2410.24221"},{"key":"ref325","article-title":"EgoZero: Robot learning from smart glasses","author":"Liu","year":"2025","journal-title":"arXiv:2505.20290"},{"key":"ref326","article-title":"Humanoid policy human policy","author":"Qiu","year":"2025","journal-title":"arXiv:2503.13441"},{"key":"ref327","first-page":"879","article-title":"RoboTurk: A crowdsourcing platform for robotic skill learning through imitation","volume-title":"Proc. 2nd Conf. Robot Learn. (CoRL)","author":"Mandlekar"},{"key":"ref328","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-35142-6_14"},{"key":"ref329","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3295255"},{"key":"ref330","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.120"},{"key":"ref331","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.695"},{"key":"ref332","first-page":"4158","article-title":"Scaling robot policy learning via zero-shot labeling with foundation models","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","volume":"270","author":"Blank"},{"key":"ref333","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.152"},{"key":"ref334","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Team","year":"2024","journal-title":"arXiv:2403.05530"},{"key":"ref335","doi-asserted-by":"crossref","DOI":"10.1109\/ICRA57147.2024.10611615","article-title":"Rh20t: A robotic dataset for learning diverse skills in one-shot","volume-title":"Proc. RSS Workshop Learn. Task Motion Planning","author":"Fang"},{"key":"ref336","first-page":"651","article-title":"Scalable deep reinforcement learning for vision-based robotic manipulation","volume-title":"Proc. 2nd Conf. Robot Learn. (CoRL)","volume":"87","author":"Kalashnikov"},{"key":"ref337","article-title":"MT-opt: Continuous multi-task robotic reinforcement learning at scale","author":"Kalashnikov","year":"2021","journal-title":"arXiv:2104.08212"},{"key":"ref338","first-page":"885","article-title":"RoboNet: Large-scale multi-robot learning","volume-title":"Proc. Conf. Robot Learn. (CoRL)","author":"Dasari"},{"key":"ref339","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.063"},{"key":"ref340","first-page":"1723","article-title":"BridgeData v2: A dataset for robot learning at scale","volume-title":"Proc. 7th Conf. Robot Learn. 
(CoRL)","author":"Walke"},{"key":"ref341","first-page":"991","article-title":"BC-Z: Zero-shot task generalization with robotic imitation learning","volume-title":"Proc. 5th Conf. Robot Learn. (CoRL)","author":"Jang"},{"key":"ref342","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3180108"},{"key":"ref343","first-page":"44776","article-title":"LIBERO: Benchmarking knowledge transfer for lifelong robot learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Liu"},{"key":"ref344","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02034"},{"key":"ref345","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00050"},{"key":"ref346","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00998"},{"key":"ref347","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01244"},{"key":"ref348","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02033"},{"key":"ref349","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref350","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref351","article-title":"A short note on the kinetics-700 human action dataset","author":"Carreira","year":"2019","journal-title":"arXiv:1907.06987"},{"key":"ref352","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2012.6386109"},{"key":"ref353","first-page":"1820","article-title":"MimicGen: A data generation system for scalable robot learning using human demonstrations","volume-title":"Proc. 7th Conf. Robot Learn. (CoRL)","author":"Mandlekar"},{"key":"ref354","article-title":"DexMimicGen: Automated data generation for bimanual dexterous manipulation via imitation learning","author":"Jiang","year":"2024","journal-title":"arXiv:2410.24185"},{"key":"ref355","first-page":"906","article-title":"Multiple interactions made easy (MIME): Large scale demonstrations data for imitation","volume-title":"Proc. 2nd Conf. Robot Learn. (CoRL)","author":"Sharma"},{"key":"ref356","article-title":"RLDS: An ecosystem to generate, share and use datasets in reinforcement learning","author":"Ramos","year":"2021","journal-title":"arXiv:2111.02767"},{"key":"ref357","first-page":"1838","article-title":"Latent plans for task-agnostic offline reinforcement learning","volume-title":"Proc. 6th Conf. Robot Learn. (CoRL)","author":"Rosete-Beas"},{"key":"ref358","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160396"},{"key":"ref359","volume-title":"Clvr Jaco Play Dataset","author":"Dass","year":"2023"},{"key":"ref360","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2024.3353075"},{"key":"ref361","volume-title":"Berkeley UR5 Demonstration Dataset","author":"Chen","year":"2025"},{"key":"ref362","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160594"},{"key":"ref363","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611293"},{"key":"ref364","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3329626"},{"key":"ref365","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3184025"},{"key":"ref366","first-page":"674","article-title":"Rapid exploration for open-world navigation with latent goal models","volume-title":"Proc. 5th Conf. Robot Learn. (CoRL)","author":"Shah"},{"key":"ref367","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"ref368","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610216"},{"key":"ref369","article-title":"CACTI: A framework for scalable multi-task multi-scene visual imitation learning","volume-title":"Proc. 
CoRL Workshop Pre-Training Robot Learn.","author":"Mandi"},{"key":"ref370","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.010"},{"key":"ref371","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.027"},{"key":"ref372","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01761"},{"key":"ref373","article-title":"Run-time observation interventions make vision-language-action models more visually robust","author":"Hancock","year":"2024","journal-title":"arXiv:2410.01971"},{"key":"ref374","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.029"},{"key":"ref375","first-page":"627","article-title":"A reduction of imitation learning and structured prediction to no-regret online learning","volume-title":"Proc. 14th Int. Conf. Artif. Intell. Statist.","author":"Ross"},{"key":"ref376","article-title":"CCIL: Continuity-based data augmentation for corrective imitation learning","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Ke"},{"key":"ref377","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9340700"},{"key":"ref378","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.099"},{"key":"ref379","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.089"},{"key":"ref380","doi-asserted-by":"publisher","DOI":"10.1109\/HRI61500.2025.10973961"},{"key":"ref381","article-title":"Robosuite: A modular simulation framework and benchmark for robot learning","author":"Zhu","year":"2020","journal-title":"arXiv:2009.12293"},{"key":"ref382","first-page":"1678","article-title":"What matters in learning from offline human demonstrations for robot manipulation","volume-title":"Proc. 5th Conf. Robot Learn. (CoRL)","author":"Mandlekar"},{"key":"ref383","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.050"},{"key":"ref384","first-page":"1094","article-title":"Meta-world: A benchmark and evaluation for multi-task and meta reinforcement learning","volume-title":"Proc. Conf. Robot Learn. (CoRL)","author":"Yu"},{"key":"ref385","article-title":"LeVERB: Humanoid whole-body control with latent vision-language instruction","author":"Xue","year":"2025","journal-title":"arXiv:2506.13751"},{"key":"ref386","article-title":"ManiSkill: Generalizable manipulation skill benchmark with large-scale demonstrations","volume-title":"Proc. Neural Inf. Process. Syst. Track Datasets Benchmarks","author":"Mu"},{"key":"ref387","article-title":"ManiSkill2: A unified benchmark for generalizable manipulation skills","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Gu"},{"key":"ref388","article-title":"ManiSkill3: GPU parallelized robotics simulation and rendering for generalizable embodied AI","author":"Tao","year":"2024","journal-title":"arXiv:2410.00425"},{"key":"ref389","article-title":"Maniskill-hab: A benchmark for low-level manipulation in home rearrangement tasks","volume-title":"Proc. Int. Conf. Learn. Represent. 
(ICLR)","author":"Shukla"},{"key":"ref390","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02575"},{"key":"ref391","article-title":"RoboTwin 2.0: A scalable data generator and benchmark with strong domain randomization for robust bimanual robotic manipulation","author":"Chen","year":"2025","journal-title":"arXiv:2506.18088"},{"key":"ref392","article-title":"LoHoRavens: A long-horizon language-conditioned benchmark for robotic tabletop manipulation","author":"Zhang","year":"2023","journal-title":"arXiv:2310.12020"},{"key":"ref393","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00943"},{"key":"ref394","first-page":"251","article-title":"Habitat 2.0: Training home assistants to rearrange their habitat","volume-title":"Proc. Adv. Neural Inf. Process. Syst. (NeurIPS)","author":"Szot"},{"key":"ref395","article-title":"Habitat 3.0: A co-habitat for humans, avatars and robots","author":"Puig","year":"2023","journal-title":"arXiv:2310.13724"},{"key":"ref396","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2974707"},{"key":"ref397","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.133"},{"key":"ref398","article-title":"AI2-THOR: An interactive 3D environment for visual AI","author":"Kolve","year":"2017","journal-title":"arXiv:1712.05474"},{"key":"ref399","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01537"},{"key":"ref400","first-page":"3705","article-title":"Evaluating real-world robot manipulation policies in simulation","volume-title":"Proc. 8th Conf. Robot Learn. (CoRL)","author":"Li"},{"key":"ref401","article-title":"RoboArena: Distributed real-world evaluation of generalist robot policies","author":"Atreya","year":"2025","journal-title":"arXiv:2506.18123"},{"key":"ref402","article-title":"Libero: Benchmarking knowledge transfer for lifelong robot learning","author":"Liu","year":"2023","journal-title":"arXiv:2306.03310"},{"key":"ref403","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3270034"},{"key":"ref404","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01111"},{"key":"ref405","first-page":"1","article-title":"Robocas: A benchmark for robotic manipulation in complex object arrangement scenarios","volume-title":"Proc. NeurIPS","author":"Zheng"},{"key":"ref406","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02030"},{"key":"ref407","article-title":"Pybullet, a Python module for physics simulation for games, robotics and machine learning","author":"Coumans","year":"2016"},{"key":"ref408","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2013.6696520"},{"key":"ref409","article-title":"PyRep: Bringing V-REP to deep robot learning","author":"James","year":"2019","journal-title":"arXiv:1906.11176"},{"key":"ref410","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00323"},{"key":"ref411","first-page":"5982","article-title":"ProcTHOR: Large-scale embodied AI using procedural generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 
(NeurIPS)","author":"Deitke"},{"key":"ref412","doi-asserted-by":"publisher","DOI":"10.1145\/3729343"},{"key":"ref413","article-title":"Exploring the adversarial vulnerabilities of vision-language-action models in robotics","author":"Wang","year":"2024","journal-title":"arXiv:2411.13587"},{"key":"ref414","article-title":"Manipulation facing threats: Evaluating physical vulnerabilities in end-to-end vision language action models","author":"Cheng","year":"2024","journal-title":"arXiv:2409.13174"},{"key":"ref415","doi-asserted-by":"publisher","DOI":"10.1109\/ICAD65464.2025.11114062"},{"key":"ref416","article-title":"Mobility VLA: Multimodal instruction navigation with long-context VLMs and topological graphs","author":"Chiang","year":"2024","journal-title":"arXiv:2407.07775"},{"key":"ref417","article-title":"RaceVLA: VLA-based racing drone navigation with human-like behaviour","author":"Serpiva","year":"2025","journal-title":"arXiv:2503.02572"},{"key":"ref418","article-title":"CognitiveDrone: A VLA model and evaluation benchmark for real-time cognitive task solving and reasoning in UAVs","author":"Lykov","year":"2025","journal-title":"arXiv:2503.01378"},{"key":"ref419","article-title":"Scaling cross-embodied learning: One policy for manipulation, navigation, locomotion and aviation","author":"Doshi","year":"2024","journal-title":"arXiv:2408.11812"},{"key":"ref420","article-title":"Tracevla: Visual trace prompting enhances spatial\u2013temporal awareness for generalist robotic policies","author":"Zheng","year":"2024","journal-title":"arXiv:2412.10345"},{"key":"ref421","article-title":"EgoVLA: Learning vision-language-action models from egocentric human videos","author":"Yang","year":"2025","journal-title":"arXiv:2507.12440"},{"key":"ref422","article-title":"Is single-view mesh reconstruction ready for robotics?","author":"Nolte","year":"2025","journal-title":"arXiv:2505.17966"},{"key":"ref423","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2021.xvii.026"},{"key":"ref424","article-title":"Grasp-MPC: Closed-loop visual grasping via value-guided model predictive control","author":"Yamada","year":"2025","journal-title":"arXiv:2509.06201"},{"key":"ref425","article-title":"SAFE: Multitask failure detection for vision-language-action models","author":"Gu","year":"2025","journal-title":"arXiv:2506.09937"},{"key":"ref426","article-title":"Agentic robot: A brain-inspired framework for vision-language-action models in embodied agents","author":"Yang","year":"2025","journal-title":"arXiv:2505.23450"},{"key":"ref427","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-025-08744-2"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10820123\/11164279.pdf?arnumber=11164279","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,30]],"date-time":"2025-09-30T13:10:03Z","timestamp":1759237803000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11164279\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":427,"URL":"https:\/\/doi.org\/10.1109\/access.2025.3609980","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}