{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:08:06Z","timestamp":1750219686871,"version":"3.41.0"},"reference-count":260,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Robotics and Computer-Integrated Manufacturing"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1016\/j.rcim.2025.103064","type":"journal-article","created":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T10:09:02Z","timestamp":1749636542000},"page":"103064","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Empowering natural human\u2013robot collaboration through multimodal language models and spatial intelligence: Pathways and perspectives"],"prefix":"10.1016","volume":"97","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9052-6468","authenticated-orcid":false,"given":"Duidi","family":"Wu","sequence":"first","affiliation":[]},{"given":"Pai","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Qianyou","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Shuo","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Jin","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Guo-Niu","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Lihui","family":"Wang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.rcim.2025.103064_b1","doi-asserted-by":"crossref","first-page":"612","DOI":"10.1016\/j.jmsy.2022.02.001","article-title":"Outlook on human-centric manufacturing towards Industry 5.0","volume":"62","author":"Lu","year":"2022","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b2","doi-asserted-by":"crossref","first-page":"199","DOI":"10.1016\/j.jmsy.2021.11.001","article-title":"A futuristic perspective on human-centric assembly","volume":"62","author":"Wang","year":"2022","journal-title":"J. Manuf. Syst."},{"issue":"3","key":"10.1016\/j.rcim.2025.103064_b3","doi-asserted-by":"crossref","DOI":"10.1002\/aisy.202300359","article-title":"Multimodal Human\u2013Robot interaction for Human-Centric smart manufacturing: A survey","volume":"6","author":"Wang","year":"2024","journal-title":"Adv. Intell. Syst."},{"issue":"18","key":"10.1016\/j.rcim.2025.103064_b4","doi-asserted-by":"crossref","first-page":"10355","DOI":"10.1109\/JSEN.2020.2995271","article-title":"Progress and prospects of multimodal fusion methods in physical Human\u2013Robot interaction: A review","volume":"20","author":"Xue","year":"2020","journal-title":"IEEE Sens. J."},{"key":"10.1016\/j.rcim.2025.103064_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2022.102510","article-title":"Proactive human\u2013robot collaboration: Mutual-cognitive, predictable, and self-organising perspectives","volume":"81","author":"Li","year":"2023","journal-title":"Robot. Comput.-Integr. Manuf."},{"key":"10.1016\/j.rcim.2025.103064_b6","doi-asserted-by":"crossref","first-page":"74762","DOI":"10.1109\/ACCESS.2018.2884793","article-title":"Towards robust Human-Robot collaborative manufacturing: Multimodal fusion","volume":"6","author":"Liu","year":"2018","journal-title":"IEEE Access"},{"key":"10.1016\/j.rcim.2025.103064_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2023.102610","article-title":"Cognitive neuroscience and robotics: Advancements and future research directions","volume":"85","author":"Liu","year":"2024","journal-title":"Robot. Comput.-Integr. Manuf."},{"year":"2024","author":"Dai","series-title":"Automated creation of digital cousins for robust policy learning","key":"10.1016\/j.rcim.2025.103064_b8"},{"year":"2023","author":"Firoozi","series-title":"Foundation models in robotics: Applications, challenges, and the future","key":"10.1016\/j.rcim.2025.103064_b9"},{"year":"2023","author":"Xiao","series-title":"Robot learning in the era of foundation models: A survey","key":"10.1016\/j.rcim.2025.103064_b10"},{"year":"2023","author":"Zeng","series-title":"Large language models for robotics: A survey","key":"10.1016\/j.rcim.2025.103064_b11"},{"year":"2023","author":"Hu","series-title":"Toward general-purpose robots via foundation models: A survey and meta-analysis","key":"10.1016\/j.rcim.2025.103064_b12"},{"year":"2024","author":"Kawaharazuka","series-title":"Real-world robot applications of foundation models: A review","key":"10.1016\/j.rcim.2025.103064_b13"},{"year":"2024","author":"Durante","series-title":"Agent AI: Surveying the horizons of multimodal interaction","key":"10.1016\/j.rcim.2025.103064_b14"},{"year":"2023","author":"Huang","series-title":"VoxPoser: Composable 3D value maps for robotic manipulation with language models","key":"10.1016\/j.rcim.2025.103064_b15"},{"year":"2023","author":"Zhou","series-title":"A comprehensive survey on pretrained foundation models: A history from BERT to ChatGPT","key":"10.1016\/j.rcim.2025.103064_b16"},{"year":"2024","series-title":"GPT-4 technical report","key":"10.1016\/j.rcim.2025.103064_b17"},{"year":"2024","series-title":"Gemini: A family of highly capable multimodal models","key":"10.1016\/j.rcim.2025.103064_b18"},{"year":"2022","author":"Ouyang","series-title":"Training language models to follow instructions with human feedback","key":"10.1016\/j.rcim.2025.103064_b19"},{"year":"2023","author":"Touvron","series-title":"LLaMA: Open and efficient foundation language models","key":"10.1016\/j.rcim.2025.103064_b20"},{"year":"2023","author":"Taori","series-title":"Stanford alpaca: An instruction-following LLaMA model","key":"10.1016\/j.rcim.2025.103064_b21"},{"year":"2023","author":"Chiang","series-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","key":"10.1016\/j.rcim.2025.103064_b22"},{"year":"2023","author":"Driess","series-title":"Palm-E: An embodied multimodal language model","key":"10.1016\/j.rcim.2025.103064_b23"},{"year":"2021","author":"Dosovitskiy","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","key":"10.1016\/j.rcim.2025.103064_b24"},{"year":"2021","author":"Touvron","series-title":"Training data-efficient image transformers & distillation through attention","key":"10.1016\/j.rcim.2025.103064_b25"},{"year":"2021","author":"Wang","series-title":"Pyramid vision transformer: A versatile backbone for dense prediction without convolutions","key":"10.1016\/j.rcim.2025.103064_b26"},{"year":"2021","author":"Yuan","series-title":"Florence: A new foundation model for computer vision","key":"10.1016\/j.rcim.2025.103064_b27"},{"year":"2021","author":"Cho","series-title":"Unifying vision-and-language tasks via text generation","key":"10.1016\/j.rcim.2025.103064_b28"},{"year":"2022","author":"Alayrac","series-title":"Flamingo: a visual language model for few-shot learning","key":"10.1016\/j.rcim.2025.103064_b29"},{"year":"2023","author":"Brohan","series-title":"RT-1: Robotics Transformer for Real-World Control at Scale","key":"10.1016\/j.rcim.2025.103064_b30"},{"year":"2022","author":"Jiang","series-title":"VIMA: General robot manipulation with multimodal prompts","key":"10.1016\/j.rcim.2025.103064_b31"},{"year":"2023","series-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","key":"10.1016\/j.rcim.2025.103064_b32"},{"year":"2024","author":"Ding","series-title":"QUAR-VLA: Vision-language-action model for quadruped robots","key":"10.1016\/j.rcim.2025.103064_b33"},{"year":"2024","author":"Arai","series-title":"CoVla: Comprehensive vision-language-action dataset for autonomous driving","key":"10.1016\/j.rcim.2025.103064_b34"},{"year":"2023","author":"Liu","series-title":"Visual instruction tuning","key":"10.1016\/j.rcim.2025.103064_b35"},{"year":"2021","author":"Radford","series-title":"Learning transferable visual models from natural language supervision","key":"10.1016\/j.rcim.2025.103064_b36"},{"year":"2019","author":"Lu","series-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","key":"10.1016\/j.rcim.2025.103064_b37"},{"year":"2022","author":"Li","series-title":"Grounded language-image pre-training","key":"10.1016\/j.rcim.2025.103064_b38"},{"year":"2022","author":"Rombach","series-title":"High-resolution image synthesis with latent diffusion models","key":"10.1016\/j.rcim.2025.103064_b39"},{"year":"2024","author":"Zhan","series-title":"AnyGPT: Unified multimodal LLM with discrete sequence modeling","key":"10.1016\/j.rcim.2025.103064_b40"},{"year":"2024","author":"Wu","series-title":"NExT-GPT: Any-to-Any multimodal LLM","key":"10.1016\/j.rcim.2025.103064_b41"},{"year":"2024","author":"Wang","series-title":"ModaVerse: Efficiently transforming modalities with LLMs","key":"10.1016\/j.rcim.2025.103064_b42"},{"year":"2024","author":"Fu","series-title":"A touch, vision, and language dataset for multimodal alignment","key":"10.1016\/j.rcim.2025.103064_b43"},{"key":"10.1016\/j.rcim.2025.103064_b44","doi-asserted-by":"crossref","first-page":"1188","DOI":"10.1016\/j.procir.2020.03.022","article-title":"Symbiotic human-robot collaboration: multimodal control using function blocks","volume":"93","author":"Liu","year":"2020","journal-title":"Procedia CIRP"},{"key":"10.1016\/j.rcim.2025.103064_b45","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2022.102359","article-title":"An electromyography signals-based human-robot collaboration system for human motion intention recognition and realization","volume":"77","author":"Zhang","year":"2022","journal-title":"Robot. Comput.-Integr. Manuf."},{"year":"2021","author":"Wang","series-title":"ActionCLIP: A new paradigm for video action recognition","key":"10.1016\/j.rcim.2025.103064_b46"},{"issue":"2","key":"10.1016\/j.rcim.2025.103064_b47","doi-asserted-by":"crossref","first-page":"392","DOI":"10.1007\/s11263-023-01876-w","article-title":"Transferring Vision-Language models for visual recognition: A classifier perspective","volume":"132","author":"Wu","year":"2024","journal-title":"Int. J. Comput. Vis."},{"year":"2024","author":"Ding","series-title":"AToM-Bot: Embodied fulfillment of unspoken human needs with affective theory of mind","key":"10.1016\/j.rcim.2025.103064_b48"},{"year":"2024","author":"Huang","series-title":"LIT: Large language model driven intention tracking for proactive human-robot collaboration \u2013 A robot Sous-Chef application","key":"10.1016\/j.rcim.2025.103064_b49"},{"year":"2022","author":"Zhang","series-title":"MotionDiffuse: Text-Driven human motion generation with diffusion model","key":"10.1016\/j.rcim.2025.103064_b50"},{"year":"2022","author":"Tevet","series-title":"MotionCLIP: Exposing human motion generation to CLIP space","key":"10.1016\/j.rcim.2025.103064_b51"},{"year":"2023","author":"Jiang","series-title":"MotionGPT: Human motion as a foreign language","key":"10.1016\/j.rcim.2025.103064_b52"},{"year":"2021","author":"Kamath","series-title":"MDETR \u2013 Modulated detection for End-to-End multi-modal understanding","key":"10.1016\/j.rcim.2025.103064_b53"},{"year":"2022","author":"Minderer","series-title":"Simple open-vocabulary object detection with vision transformers","key":"10.1016\/j.rcim.2025.103064_b54"},{"year":"2021","author":"Caron","series-title":"Emerging properties in Self-Supervised vision transformers","key":"10.1016\/j.rcim.2025.103064_b55"},{"year":"2024","author":"Liu","series-title":"Grounding DINO: Marrying DINO with grounded Pre-Training for Open-Set object detection","key":"10.1016\/j.rcim.2025.103064_b56"},{"year":"2023","author":"Zhai","series-title":"Sigmoid loss for language image Pre-Training","key":"10.1016\/j.rcim.2025.103064_b57"},{"year":"2023","author":"Yu","series-title":"Scaling robot learning with semantically imagined experience","key":"10.1016\/j.rcim.2025.103064_b58"},{"year":"2024","author":"Gao","series-title":"Physically grounded vision-language models for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b59"},{"year":"2023","author":"Hu","series-title":"Look before you leap: Unveiling the power of GPT-4V in robotic vision-language planning","key":"10.1016\/j.rcim.2025.103064_b60"},{"year":"2023","author":"Stone","series-title":"Open-World object manipulation using Pre-trained vision-language models","key":"10.1016\/j.rcim.2025.103064_b61"},{"year":"2024","author":"Oquab","series-title":"DINOv2: Learning robust visual features without supervision","key":"10.1016\/j.rcim.2025.103064_b62"},{"year":"2023","author":"Kirillov","series-title":"Segment anything","key":"10.1016\/j.rcim.2025.103064_b63"},{"year":"2023","author":"Huang","series-title":"Instruct2Act: Mapping multi-modality instructions to robotic actions with large language model","key":"10.1016\/j.rcim.2025.103064_b64"},{"year":"2024","author":"Huang","series-title":"CoPa: General robotic manipulation through spatial constraints of parts with foundation models","key":"10.1016\/j.rcim.2025.103064_b65"},{"year":"2024","author":"Ren","series-title":"Grounded SAM: Assembling open-world models for diverse visual tasks","key":"10.1016\/j.rcim.2025.103064_b66"},{"year":"2024","author":"Liu","series-title":"MOKA: Open-Vocabulary robotic manipulation through Mark-Based visual prompting","key":"10.1016\/j.rcim.2025.103064_b67"},{"year":"2024","author":"Huang","series-title":"Rekep: Spatio-Temporal reasoning of relational keypoint constraints for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b68"},{"year":"2024","author":"Kuang","series-title":"RAM: Retrieval-based affordance transfer for generalizable Zero-Shot robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b69"},{"year":"2023","author":"Guo","series-title":"Point-bind & point-LLM: Aligning point cloud with Multi-modality for 3D understanding, generation, and instruction following","key":"10.1016\/j.rcim.2025.103064_b70"},{"year":"2023","author":"Xu","series-title":"PointLLM: Empowering large language models to understand point clouds","key":"10.1016\/j.rcim.2025.103064_b71"},{"year":"2024","author":"Chen","series-title":"SpatialVLM: Endowing vision-language models with spatial reasoning capabilities","key":"10.1016\/j.rcim.2025.103064_b72"},{"year":"2022","author":"Shah","series-title":"LM-Nav: Robotic navigation with large pre-trained models of language, vision, and action","key":"10.1016\/j.rcim.2025.103064_b73"},{"year":"2022","author":"Chen","series-title":"Open-vocabulary queryable scene representations for real world planning","key":"10.1016\/j.rcim.2025.103064_b74"},{"year":"2024","author":"Zhang","series-title":"Navid: Video-based VLM plans the next step for vision-and-language navigation","key":"10.1016\/j.rcim.2025.103064_b75"},{"year":"2024","author":"Wang","series-title":"VLM see, robot do: Human demo video to robot action plan via vision language model","key":"10.1016\/j.rcim.2025.103064_b76"},{"key":"10.1016\/j.rcim.2025.103064_b77","doi-asserted-by":"crossref","first-page":"3897","DOI":"10.1109\/TRO.2024.3420722","article-title":"Learning Human-Like functional grasping for multifinger hands from few demonstrations","volume":"40","author":"Wei","year":"2024","journal-title":"IEEE Trans. Robot."},{"year":"2024","author":"Li","series-title":"OKAMI: Teaching humanoid robots manipulation skills through single video imitation","key":"10.1016\/j.rcim.2025.103064_b78"},{"year":"2021","author":"Shridhar","series-title":"CLIPort: What and where pathways for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b79"},{"year":"2022","author":"Ahn","series-title":"Do as I can, not as I say: Grounding language in robotic affordances","key":"10.1016\/j.rcim.2025.103064_b80"},{"year":"2023","author":"Karamcheti","series-title":"Language-driven representation learning for robotics","key":"10.1016\/j.rcim.2025.103064_b81"},{"year":"2024","author":"Tang","series-title":"KALIE: Fine-tuning vision-language models for open-world manipulation without robot data","key":"10.1016\/j.rcim.2025.103064_b82"},{"year":"2024","author":"Yuan","series-title":"RoboPoint: A vision-language model for spatial affordance prediction for robotics","key":"10.1016\/j.rcim.2025.103064_b83"},{"year":"2024","author":"Srirama","series-title":"HRP: Human affordances for robotic pre-training","key":"10.1016\/j.rcim.2025.103064_b84"},{"year":"2023","author":"Mees","series-title":"Grounding language with visual affordances over unstructured data","key":"10.1016\/j.rcim.2025.103064_b85"},{"issue":"4","key":"10.1016\/j.rcim.2025.103064_b86","doi-asserted-by":"crossref","first-page":"3308","DOI":"10.1109\/LRA.2018.2852786","article-title":"Interactive Text2Pickup networks for natural language-based Human\u2013Robot collaboration","volume":"3","author":"Ahn","year":"2018","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.rcim.2025.103064_b87","series-title":"Affordances from Human Videos as a Versatile Representation for Robotics","first-page":"01","author":"Bahl","year":"2023"},{"year":"2019","author":"Manuelli","series-title":"kPAM: KeyPoint affordances for Category-Level robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b88"},{"year":"2023","author":"Huang","series-title":"Grounded decoding: Guiding text generation with grounded models for embodied agents","key":"10.1016\/j.rcim.2025.103064_b89"},{"issue":"8","key":"10.1016\/j.rcim.2025.103064_b90","doi-asserted-by":"crossref","first-page":"1345","DOI":"10.1007\/s10514-023-10131-7","article-title":"Text2Motion: from natural language instructions to feasible plans","volume":"47","author":"Lin","year":"2023","journal-title":"Auton. Robots"},{"year":"2022","author":"Huang","series-title":"Inner monologue: Embodied reasoning through planning with language models","key":"10.1016\/j.rcim.2025.103064_b91"},{"year":"2023","author":"Liang","series-title":"Code as policies: Language model programs for embodied control","key":"10.1016\/j.rcim.2025.103064_b92"},{"year":"2022","author":"Singh","series-title":"ProgPrompt: Generating situated robot task plans using large language models","key":"10.1016\/j.rcim.2025.103064_b93"},{"year":"2022","author":"Zeng","series-title":"Socratic models: Composing zero-shot multimodal reasoning with language","key":"10.1016\/j.rcim.2025.103064_b94"},{"year":"2023","author":"Vemprala","series-title":"ChatGPT for robotics: Design principles and model abilities","key":"10.1016\/j.rcim.2025.103064_b95"},{"year":"2023","author":"Jin","series-title":"RobotGPT: Robot manipulation learning from ChatGPT","key":"10.1016\/j.rcim.2025.103064_b96"},{"key":"10.1016\/j.rcim.2025.103064_b97","series-title":"How to Prompt Your Robot: A PromptBook for Manipulation Skills with Code as Policies","first-page":"4340","author":"Arenas","year":"2024"},{"issue":"8","key":"10.1016\/j.rcim.2025.103064_b98","doi-asserted-by":"crossref","first-page":"6904","DOI":"10.1109\/LRA.2024.3415931","article-title":"Enhancing the LLM-based robot manipulation through Human-Robot collaboration","volume":"9","author":"Liu","year":"2024","journal-title":"IEEE Robot. Autom. Lett."},{"year":"2024","author":"Mei","series-title":"ReplanVLM: Replanning robotic tasks with visual language models","key":"10.1016\/j.rcim.2025.103064_b99"},{"year":"2023","author":"Wu","series-title":"TidyBot: Personalized robot assistance with large language models","key":"10.1016\/j.rcim.2025.103064_b100"},{"key":"10.1016\/j.rcim.2025.103064_b101","series-title":"Task-Oriented Multi-Modal Question Answering For Collaborative Applications","first-page":"1426","author":"Tan","year":"2020"},{"key":"10.1016\/j.rcim.2025.103064_b102","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1080\/09544828.2023.2272555","article-title":"A question answering system for assembly process of wind turbines based on multi-modal knowledge graph and large language model","author":"Hu","year":"2023","journal-title":"J. Eng. Des."},{"year":"2022","author":"Zeng","series-title":"Transporter networks: Rearranging the visual world for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b103"},{"year":"2024","author":"Hua","series-title":"GenSim2: Scaling robot data generation with Multi-modal and reasoning LLMs","key":"10.1016\/j.rcim.2025.103064_b104"},{"key":"10.1016\/j.rcim.2025.103064_b105","doi-asserted-by":"crossref","first-page":"1009","DOI":"10.1016\/j.jmsy.2024.05.003","article-title":"A vision-language-guided robotic action planning approach for ambiguity mitigation in human\u2013robot collaborative manufacturing","volume":"74","author":"Fan","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b106","article-title":"A vision-language-guided and deep reinforcement learning-enabled approach for unstructured human-robot collaborative manufacturing task fulfilment","author":"Zheng","year":"2024","journal-title":"Vis."},{"year":"2024","author":"Shirai","series-title":"Vision-language interpreter for robot task planning","key":"10.1016\/j.rcim.2025.103064_b107"},{"year":"2022","author":"Nair","series-title":"R3M: A universal visual representation for robot manipulation","key":"10.1016\/j.rcim.2025.103064_b108"},{"year":"2024","author":"Papagiannis","series-title":"R+x: Retrieval and execution from everyday human videos","key":"10.1016\/j.rcim.2025.103064_b109"},{"year":"2023","author":"Gu","series-title":"RT-Trajectory: Robotic task generalization via hindsight trajectory sketches","key":"10.1016\/j.rcim.2025.103064_b110"},{"year":"2024","author":"Bharadhwaj","series-title":"Gen2Act: Human video generation in novel scenarios enables generalizable robot manipulation","key":"10.1016\/j.rcim.2025.103064_b111"},{"year":"2024","author":"Kareer","series-title":"EgoMimic: Scaling imitation learning via egocentric video","key":"10.1016\/j.rcim.2025.103064_b112"},{"year":"2019","author":"Lynch","series-title":"Learning latent plans from play","key":"10.1016\/j.rcim.2025.103064_b113"},{"year":"2021","author":"Lynch","series-title":"Language conditioned imitation learning over unstructured data","key":"10.1016\/j.rcim.2025.103064_b114"},{"year":"2022","author":"Jang","series-title":"BC-Z: Zero-Shot task generalization with robotic imitation learning","key":"10.1016\/j.rcim.2025.103064_b115"},{"year":"2022","author":"Lynch","series-title":"Interactive language: Talking to robots in real time","key":"10.1016\/j.rcim.2025.103064_b116"},{"year":"2021","author":"Mees","series-title":"CALVIN: A benchmark for language-conditioned policy learning for long-horizon robot manipulation tasks","key":"10.1016\/j.rcim.2025.103064_b117"},{"year":"2022","author":"Mees","series-title":"What matters in language conditioned robotic imitation learning over unstructured data","key":"10.1016\/j.rcim.2025.103064_b118"},{"year":"2021","author":"Florence","series-title":"Implicit behavioral cloning","key":"10.1016\/j.rcim.2025.103064_b119"},{"year":"2022","author":"Shafiullah","series-title":"Behavior transformers: Cloning $k$ modes with one stone","key":"10.1016\/j.rcim.2025.103064_b120"},{"year":"2023","author":"Zhao","series-title":"Learning fine-grained bimanual manipulation with low-cost hardware","key":"10.1016\/j.rcim.2025.103064_b121"},{"year":"2023","author":"Wu","series-title":"Unleashing large-scale video generative pre-training for visual robot manipulation","key":"10.1016\/j.rcim.2025.103064_b122"},{"year":"2023","author":"Black","series-title":"Zero-shot robotic manipulation with pretrained image-editing diffusion models","key":"10.1016\/j.rcim.2025.103064_b123"},{"year":"2023","author":"Myers","series-title":"Goal representations for instruction following: A semi-supervised language interface to control","key":"10.1016\/j.rcim.2025.103064_b124"},{"year":"2023","author":"Chane-Sane","series-title":"Learning video-conditioned policies for unseen manipulation tasks","key":"10.1016\/j.rcim.2025.103064_b125"},{"year":"2023","author":"Di Palo","series-title":"Towards a unified agent with foundation models","key":"10.1016\/j.rcim.2025.103064_b126"},{"year":"2023","author":"Yu","series-title":"Language to rewards for robotic skill synthesis","key":"10.1016\/j.rcim.2025.103064_b127"},{"year":"2023","author":"Rocamonde","series-title":"Vision-language models are Zero-Shot reward models for reinforcement learning","key":"10.1016\/j.rcim.2025.103064_b128"},{"year":"2023","author":"Song","series-title":"Self-refined large language model as automated reward function designer for deep reinforcement learning in robotics","key":"10.1016\/j.rcim.2025.103064_b129"},{"year":"2024","author":"Ma","series-title":"Eureka: Human-Level reward design via coding large language models","key":"10.1016\/j.rcim.2025.103064_b130"},{"year":"2024","author":"Xie","series-title":"Text2Reward: Reward shaping with language models for reinforcement learning","key":"10.1016\/j.rcim.2025.103064_b131"},{"year":"2023","author":"Ma","series-title":"LIV: Language-Image representations and rewards for robotic control","key":"10.1016\/j.rcim.2025.103064_b132"},{"year":"2021","author":"Chen","series-title":"Decision transformer: Reinforcement learning via sequence modeling","key":"10.1016\/j.rcim.2025.103064_b133"},{"year":"2023","author":"Chakraborty","series-title":"RE-MOVE: An adaptive policy design for robotic navigation tasks in dynamic environments via language-based feedback","key":"10.1016\/j.rcim.2025.103064_b134"},{"year":"2023","author":"Shi","series-title":"Unleashing the power of pre-trained language models for offline reinforcement learning","key":"10.1016\/j.rcim.2025.103064_b135"},{"issue":"12\u201314","key":"10.1016\/j.rcim.2025.103064_b136","doi-asserted-by":"crossref","first-page":"1419","DOI":"10.1177\/02783649211046285","article-title":"Concept2Robot: Learning manipulation concepts from instructions and human demonstrations","volume":"40","author":"Shao","year":"2021","journal-title":"Int. J. Robot. Res."},{"year":"2022","author":"Zhang","series-title":"Can offline reinforcement learning help natural language understanding?","key":"10.1016\/j.rcim.2025.103064_b137"},{"year":"2024","author":"Grigsby","series-title":"AMAGO: Scalable in-context reinforcement learning for adaptive agents","key":"10.1016\/j.rcim.2025.103064_b138"},{"year":"2024","author":"Li","series-title":"Auto MC-reward: Automated dense reward design with large language models for minecraft","key":"10.1016\/j.rcim.2025.103064_b139"},{"year":"2024","author":"Cao","series-title":"Survey on large language model-enhanced reinforcement learning: Concept, taxonomy, and methods","key":"10.1016\/j.rcim.2025.103064_b140"},{"year":"2023","author":"Chi","series-title":"Diffusion policy: Visuomotor policy learning via action diffusion","key":"10.1016\/j.rcim.2025.103064_b141"},{"year":"2024","author":"Ke","series-title":"3D diffuser actor: Policy diffusion with 3D scene representations","key":"10.1016\/j.rcim.2025.103064_b142"},{"year":"2024","author":"Wang","series-title":"Poco: Policy composition from and for heterogeneous robot learning","key":"10.1016\/j.rcim.2025.103064_b143"},{"year":"2024","author":"Reuss","series-title":"Multimodal diffusion transformer: Learning versatile behavior from multimodal goals","key":"10.1016\/j.rcim.2025.103064_b144"},{"year":"2024","author":"Huang","series-title":"ARDuP: Active region video diffusion for universal policies","key":"10.1016\/j.rcim.2025.103064_b145"},{"year":"2023","author":"Ha","series-title":"Scaling up and distilling down: Language-Guided robot skill acquisition","key":"10.1016\/j.rcim.2025.103064_b146"},{"year":"2024","author":"Team","series-title":"Octo: An open-source generalist robot policy","key":"10.1016\/j.rcim.2025.103064_b147"},{"year":"2024","author":"Zhao","series-title":"ALOHA unleashed: A simple recipe for robot dexterity","key":"10.1016\/j.rcim.2025.103064_b148"},{"year":"2024","author":"Liu","series-title":"RDT-1B: a diffusion foundation model for bimanual manipulation","key":"10.1016\/j.rcim.2025.103064_b149"},{"year":"2024","author":"Black","series-title":"$ _0$: A Vision-Language-Action flow model for general robot control","key":"10.1016\/j.rcim.2025.103064_b150"},{"year":"2024","author":"Zhou","series-title":"RoboDreamer: Learning compositional world models for robot imagination","key":"10.1016\/j.rcim.2025.103064_b151"},{"year":"2023","author":"Chebotar","series-title":"Q-Transformer: Scalable offline reinforcement learning via autoregressive Q-Functions","key":"10.1016\/j.rcim.2025.103064_b152"},{"year":"2024","author":"Belkhale","series-title":"RT-H: Action hierarchies using language","key":"10.1016\/j.rcim.2025.103064_b153"},{"year":"2024","author":"Gbagbe","series-title":"Bi-VLA: Vision-Language-Action Model-Based system for bimanual robotic dexterous manipulations","key":"10.1016\/j.rcim.2025.103064_b154"},{"year":"2024","author":"Kim","series-title":"OpenVLA: An open-source vision-language-action model","key":"10.1016\/j.rcim.2025.103064_b155"},{"year":"2024","author":"Wen","series-title":"TinyVLA: Towards fast, data-efficient vision-language-action models for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b156"},{"year":"2024","author":"Li","series-title":"LLaRA: Supercharging robot learning data for vision-language policy","key":"10.1016\/j.rcim.2025.103064_b157"},{"year":"2024","author":"Li","series-title":"Vision-language foundation models as effective robot imitators","key":"10.1016\/j.rcim.2025.103064_b158"},{"year":"2024","author":"Liu","series-title":"RoboUniView: Visual-language model with unified view representation for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b159"},{"year":"2024","author":"Liu","series-title":"RoboMamba: Multimodal state space model for efficient robot reasoning and manipulation","key":"10.1016\/j.rcim.2025.103064_b160"},{"year":"2024","author":"Duan","series-title":"Manipulate-Anything: Automating real-world robots using vision-language models","key":"10.1016\/j.rcim.2025.103064_b161"},{"key":"10.1016\/j.rcim.2025.103064_b162","series-title":"2024 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"7359","article-title":"Open6DOR: Benchmarking open-instruction 6-DoF object rearrangement and a VLM-based approach","author":"Ding","year":"2024"},{"year":"2024","author":"Zhen","series-title":"3D-VLA: A 3D vision-language-action generative world model","key":"10.1016\/j.rcim.2025.103064_b163"},{"year":"2024","author":"Cheang","series-title":"GR-2: A generative video-language-action model with web-scale knowledge for robot manipulation","key":"10.1016\/j.rcim.2025.103064_b164"},{"year":"2024","author":"Chi","series-title":"EVA: An embodied world model for future video anticipation","key":"10.1016\/j.rcim.2025.103064_b165"},{"year":"2024","author":"Zhang","series-title":"PIVOT-R: Primitive-driven waypoint-aware world model for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b166"},{"year":"2024","author":"Zhou","series-title":"DINO-WM: World models on pre-trained visual features enable zero-shot planning","key":"10.1016\/j.rcim.2025.103064_b167"},{"year":"2024","author":"Zhang","series-title":"WHALE: Towards generalizable and scalable world models for embodied decision-making","key":"10.1016\/j.rcim.2025.103064_b168"},{"year":"2022","author":"Radosavovic","series-title":"Real-world robot learning with masked visual pre-training","key":"10.1016\/j.rcim.2025.103064_b169"},{"year":"2023","author":"Seo","series-title":"Masked world models for visual control","key":"10.1016\/j.rcim.2025.103064_b170"},{"year":"2023","author":"Liu","series-title":"Masked autoencoding for scalable and generalizable decision making","key":"10.1016\/j.rcim.2025.103064_b171"},{"year":"2023","author":"Radosavovic","series-title":"Robot learning with sensorimotor pre-training","key":"10.1016\/j.rcim.2025.103064_b172"},{"year":"2023","author":"Lin","series-title":"SpawnNet: Learning generalizable visuomotor skills from Pre-trained networks","key":"10.1016\/j.rcim.2025.103064_b173"},{"year":"2024","author":"Jiang","series-title":"Robots pre-train robots: Manipulation-centric robotic representation from large-scale robot datasets","key":"10.1016\/j.rcim.2025.103064_b174"},{"year":"2023","series-title":"Open X-Embodiment: Robotic learning datasets and RT-x models","key":"10.1016\/j.rcim.2025.103064_b175"},{"year":"2024","author":"Walke","series-title":"BridgeData V2: A dataset for robot learning at scale","key":"10.1016\/j.rcim.2025.103064_b176"},{"year":"2021","author":"Hu","series-title":"LoRA: Low-rank adaptation of large language models","key":"10.1016\/j.rcim.2025.103064_b177"},{"year":"2024","author":"Shentu","series-title":"From LLMs to actions: Latent codes as bridges in hierarchical robot control","key":"10.1016\/j.rcim.2025.103064_b178"},{"year":"2018","author":"Ha","series-title":"World models","key":"10.1016\/j.rcim.2025.103064_b179"},{"year":"2024","author":"Liang","series-title":"VisualPredicator: Learning abstract world models with Neuro-Symbolic predicates for robot planning","key":"10.1016\/j.rcim.2025.103064_b180"},{"year":"2022","author":"Reed","series-title":"A generalist agent","key":"10.1016\/j.rcim.2025.103064_b181"},{"year":"2023","author":"Bousmalis","series-title":"RoboCat: A self-improving generalist agent for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b182"},{"year":"2024","author":"Wang","series-title":"Scaling proprioceptive-visual learning with heterogeneous pre-trained transformers","key":"10.1016\/j.rcim.2025.103064_b183"},{"year":"2021","author":"Makoviychuk","series-title":"Isaac Gym: High performance GPU-based physics simulation for robot learning","key":"10.1016\/j.rcim.2025.103064_b184"},{"year":"2022","author":"Zhu","series-title":"Robosuite: A modular simulation framework and benchmark for robot learning","key":"10.1016\/j.rcim.2025.103064_b185"},{"year":"2024","author":"Lin","series-title":"Data scaling laws in imitation learning for robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b186"},{"year":"2024","author":"Khazatsky","series-title":"DROID: A large-scale in-the-wild robot manipulation dataset","key":"10.1016\/j.rcim.2025.103064_b187"},{"year":"2023","author":"Fang","series-title":"RH20T: A comprehensive robotic dataset for learning diverse skills in one-shot","key":"10.1016\/j.rcim.2025.103064_b188"},{"year":"2023","author":"Kumar","series-title":"RoboHive: A unified framework for robot learning","key":"10.1016\/j.rcim.2025.103064_b189"},{"year":"2023","author":"Gu","series-title":"ManiSkill2: A unified benchmark for generalizable manipulation skills","key":"10.1016\/j.rcim.2025.103064_b190"},{"year":"2024","author":"Tao","series-title":"ManiSkill3: GPU parallelized robotics simulation and rendering for generalizable embodied AI","key":"10.1016\/j.rcim.2025.103064_b191"},{"issue":"6","key":"10.1016\/j.rcim.2025.103064_b192","doi-asserted-by":"crossref","first-page":"3740","DOI":"10.1109\/LRA.2023.3270034","article-title":"Orbit: A unified simulation framework for interactive robot learning environments","volume":"8","author":"Mittal","year":"2023","journal-title":"IEEE Robot. Autom. Lett."},{"year":"2024","author":"Nasiriany","series-title":"RoboCasa: Large-scale simulation of everyday tasks for generalist robots","key":"10.1016\/j.rcim.2025.103064_b193"},{"year":"2021","author":"Shen","series-title":"Igibson 1.0: a simulation environment for interactive tasks in large realistic scenes","key":"10.1016\/j.rcim.2025.103064_b194"},{"year":"2022","author":"Kolve","series-title":"AI2-THOR: An interactive 3D environment for visual AI","key":"10.1016\/j.rcim.2025.103064_b195"},{"year":"2018","author":"Puig","series-title":"VirtualHome: Simulating household activities via programs","key":"10.1016\/j.rcim.2025.103064_b196"},{"year":"2019","author":"Gupta","series-title":"Relay policy learning: Solving Long-Horizon tasks via imitation and reinforcement learning","key":"10.1016\/j.rcim.2025.103064_b197"},{"year":"2021","author":"Srivastava","series-title":"BEHAVIOR: Benchmark for everyday household activities in virtual, interactive, and ecological environments","key":"10.1016\/j.rcim.2025.103064_b198"},{"year":"2019","author":"Gao","series-title":"VRKitchen: an interactive 3D virtual environment for task-oriented learning","key":"10.1016\/j.rcim.2025.103064_b199"},{"year":"2023","author":"Puig","series-title":"Habitat 3.0: A co-habitat for humans, avatars and robots","key":"10.1016\/j.rcim.2025.103064_b200"},{"year":"2021","author":"Mandlekar","series-title":"What matters in learning from offline human demonstrations for robot manipulation","key":"10.1016\/j.rcim.2025.103064_b201"},{"year":"2021","author":"Yu","series-title":"Meta-World: A benchmark and evaluation for Multi-Task and meta reinforcement learning","key":"10.1016\/j.rcim.2025.103064_b202"},{"year":"2019","author":"James","series-title":"RLBench: The robot learning benchmark & learning environment","key":"10.1016\/j.rcim.2025.103064_b203"},{"year":"2024","author":"Authors","series-title":"Genesis: A universal and generative physics engine for robotics and beyond","key":"10.1016\/j.rcim.2025.103064_b204"},{"year":"2023","author":"Chen","series-title":"LL3DA: Visual interactive instruction tuning for omni-3D understanding, reasoning, and planning","key":"10.1016\/j.rcim.2025.103064_b205"},{"year":"2023","author":"Liu","series-title":"Robot learning on the job: Human-in-the-loop autonomy and learning during deployment","key":"10.1016\/j.rcim.2025.103064_b206"},{"key":"10.1016\/j.rcim.2025.103064_b207","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2024.102728","article-title":"Leveraging error-assisted fine-tuning large language models for manufacturing excellence","volume":"88","author":"Xia","year":"2024","journal-title":"Robot. Comput.-Integr. Manuf."},{"issue":"9","key":"10.1016\/j.rcim.2025.103064_b208","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for Vision-Language models","volume":"130","author":"Zhou","year":"2022","journal-title":"Int. J. Comput. Vis."},{"year":"2024","author":"Zheng","series-title":"LlamaFactory: Unified efficient fine-tuning of 100+ language models","key":"10.1016\/j.rcim.2025.103064_b209"},{"year":"2024","author":"Amatriain","series-title":"Prompt design and engineering: Introduction and advanced methods","key":"10.1016\/j.rcim.2025.103064_b210"},{"year":"2024","author":"Schulhoff","series-title":"The prompt report: A systematic survey of prompting techniques","key":"10.1016\/j.rcim.2025.103064_b211"},{"year":"2023","author":"Mu","series-title":"EmbodiedGPT: Vision-language pre-training via embodied chain of thought","key":"10.1016\/j.rcim.2025.103064_b212"},{"year":"2024","author":"Luo","series-title":"Precise and dexterous robotic manipulation via Human-in-the-Loop reinforcement learning","key":"10.1016\/j.rcim.2025.103064_b213"},{"year":"2024","author":"Schreiter","series-title":"Human gaze and head rotation during navigation, exploration and object manipulation in shared environments with robots","key":"10.1016\/j.rcim.2025.103064_b214"},{"year":"2023","author":"Mandi","series-title":"CACTI: A framework for scalable multi-task multi-scene visual imitation learning","key":"10.1016\/j.rcim.2025.103064_b215"},{"year":"2023","author":"Chen","series-title":"GenAug: Retargeting behaviors to unseen situations via generative augmentation","key":"10.1016\/j.rcim.2025.103064_b216"},{"year":"2023","author":"Xiao","series-title":"Robotic skill acquisition via instruction augmentation with vision-language models","key":"10.1016\/j.rcim.2025.103064_b217"},{"year":"2024","author":"Wang","series-title":"RoboGen: Towards unleashing infinite data for automated robot learning via generative simulation","key":"10.1016\/j.rcim.2025.103064_b218"},{"key":"10.1016\/j.rcim.2025.103064_b219","doi-asserted-by":"crossref","first-page":"676","DOI":"10.1016\/j.jmsy.2024.04.016","article-title":"Enhancing human-guided robotic assembly: AR-assisted DT for skill-based and low-code programming","volume":"74","author":"Yin","year":"2024","journal-title":"J. Manuf. Syst."},{"year":"2023","author":"Du","series-title":"Video language planning","key":"10.1016\/j.rcim.2025.103064_b220"},{"key":"10.1016\/j.rcim.2025.103064_b221","first-page":"1","article-title":"Reactive human-to-robot dexterous handovers for anthropomorphic hand","author":"Duan","year":"2024","journal-title":"IEEE Trans. Robot."},{"year":"2024","author":"Wang","series-title":"ContactHandover: Contact-guided robot-to-human object handover","key":"10.1016\/j.rcim.2025.103064_b222"},{"issue":"1","key":"10.1016\/j.rcim.2025.103064_b223","doi-asserted-by":"crossref","first-page":"13","DOI":"10.1016\/j.cirp.2024.03.004","article-title":"Vision AI-based human-robot collaborative assembly driven by autonomous robots","volume":"73","author":"Liu","year":"2024","journal-title":"CIRP Ann"},{"key":"10.1016\/j.rcim.2025.103064_b224","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2024.102859","article-title":"Ergonomic workplace design based on real-time integration between virtual and augmented realities","volume":"92","author":"Chu","year":"2025","journal-title":"Robot. Comput.-Integr. Manuf."},{"key":"10.1016\/j.rcim.2025.103064_b225","doi-asserted-by":"crossref","DOI":"10.3389\/frobt.2020.542406","article-title":"The grasp strategy of a robot passer influences performance and quality of the Robot-Human object handover","volume":"7","author":"Ortenzi","year":"2020","journal-title":"Front. Robot. AI"},{"key":"10.1016\/j.rcim.2025.103064_b226","series-title":"Modeling human reaching phase in human-human object handover with application in robot-human handover","first-page":"3597","author":"Parastegari","year":"2017"},{"key":"10.1016\/j.rcim.2025.103064_b227","series-title":"Fast and Comfortable Interactive Robot-to-Human Object Handover","first-page":"3701","author":"Meng","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b228","series-title":"Optimizing Robot-to-Human Object Handovers using Vision-based Affordance Information","first-page":"1","author":"Lehotsky","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b229","series-title":"Object Transfer Point Estimation for Fluent Human-Robot Handovers","first-page":"2627","author":"Nemlekar","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b230","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TIM.2021.3118090","article-title":"Object transfer point predicting based on human comfort model for human-robot handover","volume":"70","author":"Liu","year":"2021","journal-title":"IEEE Trans. Instrum. Meas."},{"issue":"2","key":"10.1016\/j.rcim.2025.103064_b231","doi-asserted-by":"crossref","first-page":"3136","DOI":"10.1109\/LRA.2021.3062808","article-title":"Affordance-Aware handovers with human arm mobility constraints","volume":"6","author":"Ardon","year":"2021","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.rcim.2025.103064_b232","series-title":"Human Grasp Classification for Reactive Human-to-Robot Handovers","first-page":"11123","author":"Yang","year":"2020"},{"year":"2021","author":"Yang","series-title":"Reactive Human-to-Robot handovers of arbitrary objects","key":"10.1016\/j.rcim.2025.103064_b233"},{"year":"2022","author":"Chao","series-title":"HandoverSim: A simulation framework and benchmark for human-to-robot object handovers","key":"10.1016\/j.rcim.2025.103064_b234"},{"year":"2023","author":"Christen","series-title":"SynH2R: Synthesizing hand-object motions for learning human-to-robot handovers","key":"10.1016\/j.rcim.2025.103064_b235"},{"year":"2024","author":"Wang","series-title":"GenH2R: Learning generalizable human-to-robot handover via scalable simulation, demonstration, and imitation","key":"10.1016\/j.rcim.2025.103064_b236"},{"year":"2023","author":"Makatura","series-title":"How can large language models help humans in design and manufacturing?","key":"10.1016\/j.rcim.2025.103064_b237"},{"key":"10.1016\/j.rcim.2025.103064_b238","doi-asserted-by":"crossref","DOI":"10.1016\/j.jmsy.2024.04.020","article-title":"An LLM-based vision and language cobot navigation approach for Human-centric smart manufacturing","author":"Wang","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b239","doi-asserted-by":"crossref","DOI":"10.1016\/j.aei.2023.102333","article-title":"CausalKGPT: Industrial structure causal knowledge-enhanced large language model for cause analysis of quality problems in aerospace product manufacturing","volume":"59","author":"Zhou","year":"2024","journal-title":"Adv. Eng. Inform."},{"issue":"5\u20136","key":"10.1016\/j.rcim.2025.103064_b240","doi-asserted-by":"crossref","first-page":"2461","DOI":"10.1007\/s00170-024-13861-9","article-title":"Generative AI and DT integrated intelligent process planning: a conceptual framework","volume":"133","author":"Xu","year":"2024","journal-title":"Int. J. Adv. Manuf. Technol."},{"key":"10.1016\/j.rcim.2025.103064_b241","doi-asserted-by":"crossref","first-page":"83","DOI":"10.1016\/j.jmsy.2024.02.015","article-title":"Empowering digital twins with large language models for global temporal feature learning","volume":"74","author":"Sun","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b242","doi-asserted-by":"crossref","first-page":"314","DOI":"10.1016\/j.jmsy.2024.08.003","article-title":"Industrial Metaverse: A proactive human-robot collaboration perspective","volume":"76","author":"Li","year":"2024","journal-title":"J. Manuf. Syst."},{"year":"2025","author":"Yajima","series-title":"Zero-shot peg insertion: Identifying mating holes and estimating SE(2) poses with vision-language models","key":"10.1016\/j.rcim.2025.103064_b243"},{"issue":"2","key":"10.1016\/j.rcim.2025.103064_b244","doi-asserted-by":"crossref","first-page":"1141","DOI":"10.1007\/s10845-023-02294-y","article-title":"Embodied intelligence in manufacturing: leveraging large language models for autonomous industrial robotics","volume":"36","author":"Fan","year":"2025","journal-title":"J. Intell. Manuf."},{"year":"2023","author":"Zhang","series-title":"NOIR: Neural signal operated intelligent robots for everyday activities","key":"10.1016\/j.rcim.2025.103064_b245"},{"issue":"1","key":"10.1016\/j.rcim.2025.103064_b246","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1016\/j.cirp.2021.04.091","article-title":"Function block-based human-robot collaborative assembly driven by brainwaves","volume":"70","author":"Wang","year":"2021","journal-title":"CIRP Ann"},{"issue":"9","key":"10.1016\/j.rcim.2025.103064_b247","doi-asserted-by":"crossref","first-page":"3083","DOI":"10.1007\/s00170-021-07937-z","article-title":"A closed-loop brain-computer interface with augmented reality feedback for industrial human-robot collaboration","volume":"124","author":"Ji","year":"2023","journal-title":"Int. J. Adv. Manuf. Technol."},{"year":"2024","author":"Jiang","series-title":"Large brain model for learning generic representations with tremendous EEG data in BCI","key":"10.1016\/j.rcim.2025.103064_b248"},{"year":"2024","author":"Guan","series-title":"HallusionBench: An advanced diagnostic suite for entangled language hallucination and visual illusion in large vision-language models","key":"10.1016\/j.rcim.2025.103064_b249"},{"year":"2024","author":"Sun","series-title":"Tools fail: Detecting silent errors in faulty tools","key":"10.1016\/j.rcim.2025.103064_b250"},{"year":"2023","author":"Liu","series-title":"REFLECT: Summarizing robot experiences for failure explanation and correction","key":"10.1016\/j.rcim.2025.103064_b251"},{"year":"2024","author":"Liu","series-title":"Self-corrected multimodal large language model for End-to-End robot manipulation","key":"10.1016\/j.rcim.2025.103064_b252"},{"year":"2024","author":"Xiong","series-title":"AIC MLLM: Autonomous interactive correction MLLM for robust robotic manipulation","key":"10.1016\/j.rcim.2025.103064_b253"},{"year":"2024","author":"Cheng","series-title":"EgoThink: Evaluating first-person perspective thinking capability of vision-language models","key":"10.1016\/j.rcim.2025.103064_b254"},{"year":"2024","author":"Chavis","series-title":"Simultaneous localization and affordance prediction for tasks in egocentric video","key":"10.1016\/j.rcim.2025.103064_b255"},{"year":"2024","author":"Kannan","series-title":"SMART-LLM: Smart multi-agent robot task planning using large language models","key":"10.1016\/j.rcim.2025.103064_b256"},{"year":"2023","author":"Mandi","series-title":"Roco: Dialectic multi-robot collaboration with large language models","key":"10.1016\/j.rcim.2025.103064_b257"},{"year":"2024","author":"Ahn","series-title":"AutoRT: Embodied foundation models for large scale orchestration of robotic agents","key":"10.1016\/j.rcim.2025.103064_b258"},{"year":"2023","author":"Girdhar","series-title":"ImageBind: One embedding space to bind them all","key":"10.1016\/j.rcim.2025.103064_b259"},{"year":"2023","author":"Zhang","series-title":"Meta-transformer: A unified framework for multimodal learning","key":"10.1016\/j.rcim.2025.103064_b260"}],"container-title":["Robotics and Computer-Integrated Manufacturing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0736584525001188?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0736584525001188?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:14:36Z","timestamp":1750176876000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0736584525001188"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":260,"alternative-id":["S0736584525001188"],"URL":"https:\/\/doi.org\/10.1016\/j.rcim.2025.103064","relation":{},"ISSN":["0736-5845"],"issn-type":[{"type":"print","value":"0736-5845"}],"subject":[],"published":{"date-parts":[[2026,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Empowering natural human\u2013robot collaboration through multimodal language models and spatial intelligence: Pathways and perspectives","name":"articletitle","label":"Article Title"},{"value":"Robotics and Computer-Integrated Manufacturing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.rcim.2025.103064","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103064"}}