{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T08:38:01Z","timestamp":1780389481286,"version":"3.54.1"},"reference-count":260,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Robotics and Computer-Integrated Manufacturing"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1016\/j.rcim.2025.103064","type":"journal-article","created":{"date-parts":[[2025,6,11]],"date-time":"2025-06-11T06:09:02Z","timestamp":1749622142000},"page":"103064","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":16,"special_numbering":"C","title":["Empowering natural human\u2013robot collaboration through multimodal language models and spatial intelligence: Pathways and perspectives"],"prefix":"10.1016","volume":"97","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9052-6468","authenticated-orcid":false,"given":"Duidi","family":"Wu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pai","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qianyou","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuo","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jin","family":"Qi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guo-Niu","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lihui","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.rcim.2025.103064_b1","doi-asserted-by":"crossref","first-page":"612","DOI":"10.1016\/j.jmsy.2022.02.001","article-title":"Outlook on human-centric manufacturing towards Industry 5.0","volume":"62","author":"Lu","year":"2022","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b2","doi-asserted-by":"crossref","first-page":"199","DOI":"10.1016\/j.jmsy.2021.11.001","article-title":"A futuristic perspective on human-centric assembly","volume":"62","author":"Wang","year":"2022","journal-title":"J. Manuf. Syst."},{"issue":"3","key":"10.1016\/j.rcim.2025.103064_b3","doi-asserted-by":"crossref","DOI":"10.1002\/aisy.202300359","article-title":"Multimodal Human\u2013Robot interaction for Human-Centric smart manufacturing: A survey","volume":"6","author":"Wang","year":"2024","journal-title":"Adv. Intell. Syst."},{"issue":"18","key":"10.1016\/j.rcim.2025.103064_b4","doi-asserted-by":"crossref","first-page":"10355","DOI":"10.1109\/JSEN.2020.2995271","article-title":"Progress and prospects of multimodal fusion methods in physical Human\u2013Robot interaction: A review","volume":"20","author":"Xue","year":"2020","journal-title":"IEEE Sens. J."},{"key":"10.1016\/j.rcim.2025.103064_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2022.102510","article-title":"Proactive human\u2013robot collaboration: Mutual-cognitive, predictable, and self-organising perspectives","volume":"81","author":"Li","year":"2023","journal-title":"Robot. Comput.-Integr. Manuf."},{"key":"10.1016\/j.rcim.2025.103064_b6","doi-asserted-by":"crossref","first-page":"74762","DOI":"10.1109\/ACCESS.2018.2884793","article-title":"Towards robust Human-Robot collaborative manufacturing: Multimodal fusion","volume":"6","author":"Liu","year":"2018","journal-title":"IEEE Access"},{"key":"10.1016\/j.rcim.2025.103064_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2023.102610","article-title":"Cognitive neuroscience and robotics: Advancements and future research directions","volume":"85","author":"Liu","year":"2024","journal-title":"Robot. Comput.-Integr. Manuf."},{"key":"10.1016\/j.rcim.2025.103064_b8","series-title":"Automated creation of digital cousins for robust policy learning","author":"Dai","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b9","series-title":"Foundation models in robotics: Applications, challenges, and the future","author":"Firoozi","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b10","series-title":"Robot learning in the era of foundation models: A survey","author":"Xiao","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b11","series-title":"Large language models for robotics: A survey","author":"Zeng","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b12","series-title":"Toward general-purpose robots via foundation models: A survey and meta-analysis","author":"Hu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b13","series-title":"Real-world robot applications of foundation models: A review","author":"Kawaharazuka","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b14","series-title":"Agent AI: Surveying the horizons of multimodal interaction","author":"Durante","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b15","series-title":"VoxPoser: Composable 3D value maps for robotic manipulation with language models","author":"Huang","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b16","series-title":"A comprehensive survey on pretrained foundation models: A history from BERT to ChatGPT","author":"Zhou","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b17","series-title":"GPT-4 technical report","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b18","series-title":"Gemini: A family of highly capable multimodal models","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b19","series-title":"Training language models to follow instructions with human feedback","author":"Ouyang","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b20","series-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b21","series-title":"Stanford alpaca: An instruction-following LLaMA model","author":"Taori","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b22","series-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality","author":"Chiang","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b23","series-title":"Palm-E: An embodied multimodal language model","author":"Driess","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b24","series-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b25","series-title":"Training data-efficient image transformers & distillation through attention","author":"Touvron","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b26","series-title":"Pyramid vision transformer: A versatile backbone for dense prediction without convolutions","author":"Wang","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b27","series-title":"Florence: A new foundation model for computer vision","author":"Yuan","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b28","series-title":"Unifying vision-and-language tasks via text generation","author":"Cho","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b29","series-title":"Flamingo: a visual language model for few-shot learning","author":"Alayrac","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b30","series-title":"RT-1: Robotics Transformer for Real-World Control at Scale","author":"Brohan","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b31","series-title":"VIMA: General robot manipulation with multimodal prompts","author":"Jiang","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b32","series-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b33","series-title":"QUAR-VLA: Vision-language-action model for quadruped robots","author":"Ding","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b34","series-title":"CoVla: Comprehensive vision-language-action dataset for autonomous driving","author":"Arai","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b35","series-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b36","series-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b37","series-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"Lu","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b38","series-title":"Grounded language-image pre-training","author":"Li","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b39","series-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b40","series-title":"AnyGPT: Unified multimodal LLM with discrete sequence modeling","author":"Zhan","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b41","series-title":"NExT-GPT: Any-to-Any multimodal LLM","author":"Wu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b42","series-title":"ModaVerse: Efficiently transforming modalities with LLMs","author":"Wang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b43","series-title":"A touch, vision, and language dataset for multimodal alignment","author":"Fu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b44","doi-asserted-by":"crossref","first-page":"1188","DOI":"10.1016\/j.procir.2020.03.022","article-title":"Symbiotic human-robot collaboration: multimodal control using function blocks","volume":"93","author":"Liu","year":"2020","journal-title":"Procedia CIRP"},{"key":"10.1016\/j.rcim.2025.103064_b45","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2022.102359","article-title":"An electromyography signals-based human-robot collaboration system for human motion intention recognition and realization","volume":"77","author":"Zhang","year":"2022","journal-title":"Robot. Comput.-Integr. Manuf."},{"key":"10.1016\/j.rcim.2025.103064_b46","series-title":"ActionCLIP: A new paradigm for video action recognition","author":"Wang","year":"2021"},{"issue":"2","key":"10.1016\/j.rcim.2025.103064_b47","doi-asserted-by":"crossref","first-page":"392","DOI":"10.1007\/s11263-023-01876-w","article-title":"Transferring Vision-Language models for visual recognition: A classifier perspective","volume":"132","author":"Wu","year":"2024","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.rcim.2025.103064_b48","series-title":"AToM-Bot: Embodied fulfillment of unspoken human needs with affective theory of mind","author":"Ding","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b49","series-title":"LIT: Large language model driven intention tracking for proactive human-robot collaboration \u2013 A robot Sous-Chef application","author":"Huang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b50","series-title":"MotionDiffuse: Text-Driven human motion generation with diffusion model","author":"Zhang","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b51","series-title":"MotionCLIP: Exposing human motion generation to CLIP space","author":"Tevet","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b52","series-title":"MotionGPT: Human motion as a foreign language","author":"Jiang","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b53","series-title":"MDETR \u2013 Modulated detection for End-to-End multi-modal understanding","author":"Kamath","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b54","series-title":"Simple open-vocabulary object detection with vision transformers","author":"Minderer","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b55","series-title":"Emerging properties in Self-Supervised vision transformers","author":"Caron","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b56","series-title":"Grounding DINO: Marrying DINO with grounded Pre-Training for Open-Set object detection","author":"Liu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b57","series-title":"Sigmoid loss for language image Pre-Training","author":"Zhai","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b58","series-title":"Scaling robot learning with semantically imagined experience","author":"Yu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b59","series-title":"Physically grounded vision-language models for robotic manipulation","author":"Gao","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b60","series-title":"Look before you leap: Unveiling the power of GPT-4V in robotic vision-language planning","author":"Hu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b61","series-title":"Open-World object manipulation using Pre-trained vision-language models","author":"Stone","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b62","series-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b63","series-title":"Segment anything","author":"Kirillov","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b64","series-title":"Instruct2Act: Mapping multi-modality instructions to robotic actions with large language model","author":"Huang","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b65","series-title":"CoPa: General robotic manipulation through spatial constraints of parts with foundation models","author":"Huang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b66","series-title":"Grounded SAM: Assembling open-world models for diverse visual tasks","author":"Ren","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b67","series-title":"MOKA: Open-Vocabulary robotic manipulation through Mark-Based visual prompting","author":"Liu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b68","series-title":"Rekep: Spatio-Temporal reasoning of relational keypoint constraints for robotic manipulation","author":"Huang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b69","series-title":"RAM: Retrieval-based affordance transfer for generalizable Zero-Shot robotic manipulation","author":"Kuang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b70","series-title":"Point-bind & point-LLM: Aligning point cloud with Multi-modality for 3D understanding, generation, and instruction following","author":"Guo","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b71","series-title":"PointLLM: Empowering large language models to understand point clouds","author":"Xu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b72","series-title":"SpatialVLM: Endowing vision-language models with spatial reasoning capabilities","author":"Chen","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b73","series-title":"LM-Nav: Robotic navigation with large pre-trained models of language, vision, and action","author":"Shah","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b74","series-title":"Open-vocabulary queryable scene representations for real world planning","author":"Chen","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b75","series-title":"Navid: Video-based VLM plans the next step for vision-and-language navigation","author":"Zhang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b76","series-title":"VLM see, robot do: Human demo video to robot action plan via vision language model","author":"Wang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b77","doi-asserted-by":"crossref","first-page":"3897","DOI":"10.1109\/TRO.2024.3420722","article-title":"Learning Human-Like functional grasping for multifinger hands from few demonstrations","volume":"40","author":"Wei","year":"2024","journal-title":"IEEE Trans. Robot."},{"key":"10.1016\/j.rcim.2025.103064_b78","series-title":"OKAMI: Teaching humanoid robots manipulation skills through single video imitation","author":"Li","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b79","series-title":"CLIPort: What and where pathways for robotic manipulation","author":"Shridhar","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b80","series-title":"Do as I can, not as I say: Grounding language in robotic affordances","author":"Ahn","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b81","series-title":"Language-driven representation learning for robotics","author":"Karamcheti","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b82","series-title":"KALIE: Fine-tuning vision-language models for open-world manipulation without robot data","author":"Tang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b83","series-title":"RoboPoint: A vision-language model for spatial affordance prediction for robotics","author":"Yuan","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b84","series-title":"HRP: Human affordances for robotic pre-training","author":"Srirama","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b85","series-title":"Grounding language with visual affordances over unstructured data","author":"Mees","year":"2023"},{"issue":"4","key":"10.1016\/j.rcim.2025.103064_b86","doi-asserted-by":"crossref","first-page":"3308","DOI":"10.1109\/LRA.2018.2852786","article-title":"Interactive Text2Pickup networks for natural language-based Human\u2013Robot collaboration","volume":"3","author":"Ahn","year":"2018","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.rcim.2025.103064_b87","series-title":"Affordances from Human Videos as a Versatile Representation for Robotics","first-page":"01","author":"Bahl","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b88","series-title":"kPAM: KeyPoint affordances for Category-Level robotic manipulation","author":"Manuelli","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b89","series-title":"Grounded decoding: Guiding text generation with grounded models for embodied agents","author":"Huang","year":"2023"},{"issue":"8","key":"10.1016\/j.rcim.2025.103064_b90","doi-asserted-by":"crossref","first-page":"1345","DOI":"10.1007\/s10514-023-10131-7","article-title":"Text2Motion: from natural language instructions to feasible plans","volume":"47","author":"Lin","year":"2023","journal-title":"Auton. Robots"},{"key":"10.1016\/j.rcim.2025.103064_b91","series-title":"Inner monologue: Embodied reasoning through planning with language models","author":"Huang","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b92","series-title":"Code as policies: Language model programs for embodied control","author":"Liang","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b93","series-title":"ProgPrompt: Generating situated robot task plans using large language models","author":"Singh","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b94","series-title":"Socratic models: Composing zero-shot multimodal reasoning with language","author":"Zeng","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b95","series-title":"ChatGPT for robotics: Design principles and model abilities","author":"Vemprala","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b96","series-title":"RobotGPT: Robot manipulation learning from ChatGPT","author":"Jin","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b97","series-title":"How to Prompt Your Robot: A PromptBook for Manipulation Skills with Code as Policies","first-page":"4340","author":"Arenas","year":"2024"},{"issue":"8","key":"10.1016\/j.rcim.2025.103064_b98","doi-asserted-by":"crossref","first-page":"6904","DOI":"10.1109\/LRA.2024.3415931","article-title":"Enhancing the LLM-based robot manipulation through Human-Robot collaboration","volume":"9","author":"Liu","year":"2024","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.rcim.2025.103064_b99","series-title":"ReplanVLM: Replanning robotic tasks with visual language models","author":"Mei","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b100","series-title":"TidyBot: Personalized robot assistance with large language models","author":"Wu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b101","series-title":"Task-Oriented Multi-Modal Question Answering For Collaborative Applications","first-page":"1426","author":"Tan","year":"2020"},{"key":"10.1016\/j.rcim.2025.103064_b102","first-page":"1","article-title":"A question answering system for assembly process of wind turbines based on multi-modal knowledge graph and large language model","author":"Hu","year":"2023","journal-title":"J. Eng. Des."},{"key":"10.1016\/j.rcim.2025.103064_b103","series-title":"Transporter networks: Rearranging the visual world for robotic manipulation","author":"Zeng","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b104","series-title":"GenSim2: Scaling robot data generation with Multi-modal and reasoning LLMs","author":"Hua","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b105","doi-asserted-by":"crossref","first-page":"1009","DOI":"10.1016\/j.jmsy.2024.05.003","article-title":"A vision-language-guided robotic action planning approach for ambiguity mitigation in human\u2013robot collaborative manufacturing","volume":"74","author":"Fan","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b106","article-title":"A vision-language-guided and deep reinforcement learning-enabled approach for unstructured human-robot collaborative manufacturing task fulfilment","author":"Zheng","year":"2024","journal-title":"Vis."},{"key":"10.1016\/j.rcim.2025.103064_b107","series-title":"Vision-language interpreter for robot task planning","author":"Shirai","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b108","series-title":"R3M: A universal visual representation for robot manipulation","author":"Nair","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b109","series-title":"R+x: Retrieval and execution from everyday human videos","author":"Papagiannis","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b110","series-title":"RT-Trajectory: Robotic task generalization via hindsight trajectory sketches","author":"Gu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b111","series-title":"Gen2Act: Human video generation in novel scenarios enables generalizable robot manipulation","author":"Bharadhwaj","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b112","series-title":"EgoMimic: Scaling imitation learning via egocentric video","author":"Kareer","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b113","series-title":"Learning latent plans from play","author":"Lynch","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b114","series-title":"Language conditioned imitation learning over unstructured data","author":"Lynch","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b115","series-title":"BC-Z: Zero-Shot task generalization with robotic imitation learning","author":"Jang","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b116","series-title":"Interactive language: Talking to robots in real time","author":"Lynch","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b117","series-title":"CALVIN: A benchmark for language-conditioned policy learning for long-horizon robot manipulation tasks","author":"Mees","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b118","series-title":"What matters in language conditioned robotic imitation learning over unstructured data","author":"Mees","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b119","series-title":"Implicit behavioral cloning","author":"Florence","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b120","series-title":"Behavior transformers: Cloning $k$ modes with one stone","author":"Shafiullah","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b121","series-title":"Learning fine-grained bimanual manipulation with low-cost hardware","author":"Zhao","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b122","series-title":"Unleashing large-scale video generative pre-training for visual robot manipulation","author":"Wu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b123","series-title":"Zero-shot robotic manipulation with pretrained image-editing diffusion models","author":"Black","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b124","series-title":"Goal representations for instruction following: A semi-supervised language interface to control","author":"Myers","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b125","series-title":"Learning video-conditioned policies for unseen manipulation tasks","author":"Chane-Sane","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b126","series-title":"Towards a unified agent with foundation models","author":"Di Palo","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b127","series-title":"Language to rewards for robotic skill synthesis","author":"Yu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b128","series-title":"Vision-language models are Zero-Shot reward models for reinforcement learning","author":"Rocamonde","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b129","series-title":"Self-refined large language model as automated reward function designer for deep reinforcement learning in robotics","author":"Song","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b130","series-title":"Eureka: Human-Level reward design via coding large language models","author":"Ma","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b131","series-title":"Text2Reward: Reward shaping with language models for reinforcement learning","author":"Xie","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b132","series-title":"LIV: Language-Image representations and rewards for robotic control","author":"Ma","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b133","series-title":"Decision transformer: Reinforcement learning via sequence modeling","author":"Chen","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b134","series-title":"RE-MOVE: An adaptive policy design for robotic navigation tasks in dynamic environments via language-based feedback","author":"Chakraborty","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b135","series-title":"Unleashing the power of pre-trained language models for offline reinforcement learning","author":"Shi","year":"2023"},{"issue":"12\u201314","key":"10.1016\/j.rcim.2025.103064_b136","doi-asserted-by":"crossref","first-page":"1419","DOI":"10.1177\/02783649211046285","article-title":"Concept2Robot: Learning manipulation concepts from instructions and human demonstrations","volume":"40","author":"Shao","year":"2021","journal-title":"Int. J. Robot. Res."},{"key":"10.1016\/j.rcim.2025.103064_b137","series-title":"Can offline reinforcement learning help natural language understanding?","author":"Zhang","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b138","series-title":"AMAGO: Scalable in-context reinforcement learning for adaptive agents","author":"Grigsby","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b139","series-title":"Auto MC-reward: Automated dense reward design with large language models for minecraft","author":"Li","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b140","series-title":"Survey on large language model-enhanced reinforcement learning: Concept, taxonomy, and methods","author":"Cao","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b141","series-title":"Diffusion policy: Visuomotor policy learning via action diffusion","author":"Chi","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b142","series-title":"3D diffuser actor: Policy diffusion with 3D scene representations","author":"Ke","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b143","series-title":"Poco: Policy composition from and for heterogeneous robot learning","author":"Wang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b144","series-title":"Multimodal diffusion transformer: Learning versatile behavior from multimodal goals","author":"Reuss","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b145","series-title":"ARDuP: Active region video diffusion for universal policies","author":"Huang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b146","series-title":"Scaling up and distilling down: Language-Guided robot skill acquisition","author":"Ha","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b147","series-title":"Octo: An open-source generalist robot policy","author":"Team","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b148","series-title":"ALOHA unleashed: A simple recipe for robot dexterity","author":"Zhao","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b149","series-title":"RDT-1B: a diffusion foundation model for bimanual manipulation","author":"Liu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b150","series-title":"$ _0$: A Vision-Language-Action flow model for general robot control","author":"Black","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b151","series-title":"RoboDreamer: Learning compositional world models for robot imagination","author":"Zhou","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b152","series-title":"Q-Transformer: Scalable offline reinforcement learning via autoregressive Q-Functions","author":"Chebotar","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b153","series-title":"RT-H: Action hierarchies using language","author":"Belkhale","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b154","series-title":"Bi-VLA: Vision-Language-Action Model-Based system for bimanual robotic dexterous manipulations","author":"Gbagbe","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b155","series-title":"OpenVLA: An open-source vision-language-action model","author":"Kim","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b156","series-title":"TinyVLA: Towards fast, data-efficient vision-language-action models for robotic manipulation","author":"Wen","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b157","series-title":"LLaRA: Supercharging robot learning data for vision-language policy","author":"Li","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b158","series-title":"Vision-language foundation models as effective robot imitators","author":"Li","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b159","series-title":"RoboUniView: Visual-language model with unified view representation for robotic manipulation","author":"Liu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b160","series-title":"RoboMamba: Multimodal state space model for efficient robot reasoning and manipulation","author":"Liu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b161","series-title":"Manipulate-Anything: Automating real-world robots using vision-language models","author":"Duan","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b162","series-title":"2024 IEEE\/RSJ International Conference on Intelligent Robots and Systems","first-page":"7359","article-title":"Open6DOR: Benchmarking open-instruction 6-DoF object rearrangement and a VLM-based approach","author":"Ding","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b163","series-title":"3D-VLA: A 3D vision-language-action generative world model","author":"Zhen","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b164","series-title":"GR-2: A generative video-language-action model with web-scale knowledge for robot manipulation","author":"Cheang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b165","series-title":"EVA: An embodied world model for future video anticipation","author":"Chi","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b166","series-title":"PIVOT-R: Primitive-driven waypoint-aware world model for robotic manipulation","author":"Zhang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b167","series-title":"DINO-WM: World models on pre-trained visual features enable zero-shot planning","author":"Zhou","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b168","series-title":"WHALE: Towards generalizable and scalable world models for embodied decision-making","author":"Zhang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b169","series-title":"Real-world robot learning with masked visual pre-training","author":"Radosavovic","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b170","series-title":"Masked world models for visual control","author":"Seo","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b171","series-title":"Masked autoencoding for scalable and generalizable decision making","author":"Liu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b172","series-title":"Robot learning with sensorimotor pre-training","author":"Radosavovic","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b173","series-title":"SpawnNet: Learning generalizable visuomotor skills from Pre-trained networks","author":"Lin","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b174","series-title":"Robots pre-train robots: Manipulation-centric robotic representation from large-scale robot datasets","author":"Jiang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b175","series-title":"Open X-Embodiment: Robotic learning datasets and RT-x models","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b176","series-title":"BridgeData V2: A dataset for robot learning at scale","author":"Walke","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b177","series-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b178","series-title":"From LLMs to actions: Latent codes as bridges in hierarchical robot control","author":"Shentu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b179","series-title":"World models","author":"Ha","year":"2018"},{"key":"10.1016\/j.rcim.2025.103064_b180","series-title":"VisualPredicator: Learning abstract world models with Neuro-Symbolic predicates for robot planning","author":"Liang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b181","series-title":"A generalist agent","author":"Reed","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b182","series-title":"RoboCat: A self-improving generalist agent for robotic manipulation","author":"Bousmalis","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b183","series-title":"Scaling proprioceptive-visual learning with heterogeneous pre-trained transformers","author":"Wang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b184","series-title":"Isaac Gym: High performance GPU-based physics simulation for robot learning","author":"Makoviychuk","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b185","series-title":"Robosuite: A modular simulation framework and benchmark for robot learning","author":"Zhu","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b186","series-title":"Data scaling laws in imitation learning for robotic manipulation","author":"Lin","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b187","series-title":"DROID: A large-scale in-the-wild robot manipulation dataset","author":"Khazatsky","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b188","series-title":"RH20T: A comprehensive robotic dataset for learning diverse skills in one-shot","author":"Fang","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b189","series-title":"RoboHive: A unified framework for robot learning","author":"Kumar","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b190","series-title":"ManiSkill2: A unified benchmark for generalizable manipulation skills","author":"Gu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b191","series-title":"ManiSkill3: GPU parallelized robotics simulation and rendering for generalizable embodied AI","author":"Tao","year":"2024"},{"issue":"6","key":"10.1016\/j.rcim.2025.103064_b192","doi-asserted-by":"crossref","first-page":"3740","DOI":"10.1109\/LRA.2023.3270034","article-title":"Orbit: A unified simulation framework for interactive robot learning environments","volume":"8","author":"Mittal","year":"2023","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.rcim.2025.103064_b193","series-title":"RoboCasa: Large-scale simulation of everyday tasks for generalist robots","author":"Nasiriany","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b194","series-title":"Igibson 1.0: a simulation environment for interactive tasks in large realistic scenes","author":"Shen","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b195","series-title":"AI2-THOR: An interactive 3D environment for visual AI","author":"Kolve","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b196","series-title":"VirtualHome: Simulating household activities via programs","author":"Puig","year":"2018"},{"key":"10.1016\/j.rcim.2025.103064_b197","series-title":"Relay policy learning: Solving Long-Horizon tasks via imitation and reinforcement learning","author":"Gupta","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b198","series-title":"BEHAVIOR: Benchmark for everyday household activities in virtual, interactive, and ecological environments","author":"Srivastava","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b199","series-title":"VRKitchen: an interactive 3D virtual environment for task-oriented learning","author":"Gao","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b200","series-title":"Habitat 3.0: A co-habitat for humans, avatars and robots","author":"Puig","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b201","series-title":"What matters in learning from offline human demonstrations for robot manipulation","author":"Mandlekar","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b202","series-title":"Meta-World: A benchmark and evaluation for Multi-Task and meta reinforcement learning","author":"Yu","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b203","series-title":"RLBench: The robot learning benchmark & learning environment","author":"James","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b204","series-title":"Genesis: A universal and generative physics engine for robotics and beyond","author":"Authors","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b205","series-title":"LL3DA: Visual interactive instruction tuning for omni-3D understanding, reasoning, and planning","author":"Chen","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b206","series-title":"Robot learning on the job: Human-in-the-loop autonomy and learning during deployment","author":"Liu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b207","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2024.102728","article-title":"Leveraging error-assisted fine-tuning large language models for manufacturing excellence","volume":"88","author":"Xia","year":"2024","journal-title":"Robot. Comput.-Integr. Manuf."},{"issue":"9","key":"10.1016\/j.rcim.2025.103064_b208","doi-asserted-by":"crossref","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for Vision-Language models","volume":"130","author":"Zhou","year":"2022","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.rcim.2025.103064_b209","series-title":"LlamaFactory: Unified efficient fine-tuning of 100+ language models","author":"Zheng","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b210","series-title":"Prompt design and engineering: Introduction and advanced methods","author":"Amatriain","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b211","series-title":"The prompt report: A systematic survey of prompting techniques","author":"Schulhoff","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b212","series-title":"EmbodiedGPT: Vision-language pre-training via embodied chain of thought","author":"Mu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b213","series-title":"Precise and dexterous robotic manipulation via Human-in-the-Loop reinforcement learning","author":"Luo","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b214","series-title":"Human gaze and head rotation during navigation, exploration and object manipulation in shared environments with robots","author":"Schreiter","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b215","series-title":"CACTI: A framework for scalable multi-task multi-scene visual imitation learning","author":"Mandi","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b216","series-title":"GenAug: Retargeting behaviors to unseen situations via generative augmentation","author":"Chen","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b217","series-title":"Robotic skill acquisition via instruction augmentation with vision-language models","author":"Xiao","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b218","series-title":"RoboGen: Towards unleashing infinite data for automated robot learning via generative simulation","author":"Wang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b219","doi-asserted-by":"crossref","first-page":"676","DOI":"10.1016\/j.jmsy.2024.04.016","article-title":"Enhancing human-guided robotic assembly: AR-assisted DT for skill-based and low-code programming","volume":"74","author":"Yin","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b220","series-title":"Video language planning","author":"Du","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b221","first-page":"1","article-title":"Reactive human-to-robot dexterous handovers for anthropomorphic hand","author":"Duan","year":"2024","journal-title":"IEEE Trans. Robot."},{"key":"10.1016\/j.rcim.2025.103064_b222","series-title":"ContactHandover: Contact-guided robot-to-human object handover","author":"Wang","year":"2024"},{"issue":"1","key":"10.1016\/j.rcim.2025.103064_b223","doi-asserted-by":"crossref","first-page":"13","DOI":"10.1016\/j.cirp.2024.03.004","article-title":"Vision AI-based human-robot collaborative assembly driven by autonomous robots","volume":"73","author":"Liu","year":"2024","journal-title":"CIRP Ann"},{"key":"10.1016\/j.rcim.2025.103064_b224","doi-asserted-by":"crossref","DOI":"10.1016\/j.rcim.2024.102859","article-title":"Ergonomic workplace design based on real-time integration between virtual and augmented realities","volume":"92","author":"Chu","year":"2025","journal-title":"Robot. Comput.-Integr. Manuf."},{"key":"10.1016\/j.rcim.2025.103064_b225","doi-asserted-by":"crossref","DOI":"10.3389\/frobt.2020.542406","article-title":"The grasp strategy of a robot passer influences performance and quality of the Robot-Human object handover","volume":"7","author":"Ortenzi","year":"2020","journal-title":"Front. Robot. AI"},{"key":"10.1016\/j.rcim.2025.103064_b226","series-title":"Modeling human reaching phase in human-human object handover with application in robot-human handover","first-page":"3597","author":"Parastegari","year":"2017"},{"key":"10.1016\/j.rcim.2025.103064_b227","series-title":"Fast and Comfortable Interactive Robot-to-Human Object Handover","first-page":"3701","author":"Meng","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b228","series-title":"Optimizing Robot-to-Human Object Handovers using Vision-based Affordance Information","first-page":"1","author":"Lehotsky","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b229","series-title":"Object Transfer Point Estimation for Fluent Human-Robot Handovers","first-page":"2627","author":"Nemlekar","year":"2019"},{"key":"10.1016\/j.rcim.2025.103064_b230","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1109\/TIM.2021.3118090","article-title":"Object transfer point predicting based on human comfort model for human-robot handover","volume":"70","author":"Liu","year":"2021","journal-title":"IEEE Trans. Instrum. Meas."},{"issue":"2","key":"10.1016\/j.rcim.2025.103064_b231","doi-asserted-by":"crossref","first-page":"3136","DOI":"10.1109\/LRA.2021.3062808","article-title":"Affordance-Aware handovers with human arm mobility constraints","volume":"6","author":"Ardon","year":"2021","journal-title":"IEEE Robot. Autom. Lett."},{"key":"10.1016\/j.rcim.2025.103064_b232","series-title":"Human Grasp Classification for Reactive Human-to-Robot Handovers","first-page":"11123","author":"Yang","year":"2020"},{"key":"10.1016\/j.rcim.2025.103064_b233","series-title":"Reactive Human-to-Robot handovers of arbitrary objects","author":"Yang","year":"2021"},{"key":"10.1016\/j.rcim.2025.103064_b234","series-title":"HandoverSim: A simulation framework and benchmark for human-to-robot object handovers","author":"Chao","year":"2022"},{"key":"10.1016\/j.rcim.2025.103064_b235","series-title":"SynH2R: Synthesizing hand-object motions for learning human-to-robot handovers","author":"Christen","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b236","series-title":"GenH2R: Learning generalizable human-to-robot handover via scalable simulation, demonstration, and imitation","author":"Wang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b237","series-title":"How can large language models help humans in design and manufacturing?","author":"Makatura","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b238","doi-asserted-by":"crossref","DOI":"10.1016\/j.jmsy.2024.04.020","article-title":"An LLM-based vision and language cobot navigation approach for Human-centric smart manufacturing","author":"Wang","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b239","doi-asserted-by":"crossref","DOI":"10.1016\/j.aei.2023.102333","article-title":"CausalKGPT: Industrial structure causal knowledge-enhanced large language model for cause analysis of quality problems in aerospace product manufacturing","volume":"59","author":"Zhou","year":"2024","journal-title":"Adv. Eng. Inform."},{"issue":"5\u20136","key":"10.1016\/j.rcim.2025.103064_b240","doi-asserted-by":"crossref","first-page":"2461","DOI":"10.1007\/s00170-024-13861-9","article-title":"Generative AI and DT integrated intelligent process planning: a conceptual framework","volume":"133","author":"Xu","year":"2024","journal-title":"Int. J. Adv. Manuf. Technol."},{"key":"10.1016\/j.rcim.2025.103064_b241","doi-asserted-by":"crossref","first-page":"83","DOI":"10.1016\/j.jmsy.2024.02.015","article-title":"Empowering digital twins with large language models for global temporal feature learning","volume":"74","author":"Sun","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b242","doi-asserted-by":"crossref","first-page":"314","DOI":"10.1016\/j.jmsy.2024.08.003","article-title":"Industrial Metaverse: A proactive human-robot collaboration perspective","volume":"76","author":"Li","year":"2024","journal-title":"J. Manuf. Syst."},{"key":"10.1016\/j.rcim.2025.103064_b243","series-title":"Zero-shot peg insertion: Identifying mating holes and estimating SE(2) poses with vision-language models","author":"Yajima","year":"2025"},{"issue":"2","key":"10.1016\/j.rcim.2025.103064_b244","doi-asserted-by":"crossref","first-page":"1141","DOI":"10.1007\/s10845-023-02294-y","article-title":"Embodied intelligence in manufacturing: leveraging large language models for autonomous industrial robotics","volume":"36","author":"Fan","year":"2025","journal-title":"J. Intell. Manuf."},{"key":"10.1016\/j.rcim.2025.103064_b245","series-title":"NOIR: Neural signal operated intelligent robots for everyday activities","author":"Zhang","year":"2023"},{"issue":"1","key":"10.1016\/j.rcim.2025.103064_b246","doi-asserted-by":"crossref","first-page":"5","DOI":"10.1016\/j.cirp.2021.04.091","article-title":"Function block-based human-robot collaborative assembly driven by brainwaves","volume":"70","author":"Wang","year":"2021","journal-title":"CIRP Ann"},{"issue":"9","key":"10.1016\/j.rcim.2025.103064_b247","doi-asserted-by":"crossref","first-page":"3083","DOI":"10.1007\/s00170-021-07937-z","article-title":"A closed-loop brain-computer interface with augmented reality feedback for industrial human-robot collaboration","volume":"124","author":"Ji","year":"2023","journal-title":"Int. J. Adv. Manuf. Technol."},{"key":"10.1016\/j.rcim.2025.103064_b248","series-title":"Large brain model for learning generic representations with tremendous EEG data in BCI","author":"Jiang","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b249","series-title":"HallusionBench: An advanced diagnostic suite for entangled language hallucination and visual illusion in large vision-language models","author":"Guan","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b250","series-title":"Tools fail: Detecting silent errors in faulty tools","author":"Sun","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b251","series-title":"REFLECT: Summarizing robot experiences for failure explanation and correction","author":"Liu","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b252","series-title":"Self-corrected multimodal large language model for End-to-End robot manipulation","author":"Liu","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b253","series-title":"AIC MLLM: Autonomous interactive correction MLLM for robust robotic manipulation","author":"Xiong","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b254","series-title":"EgoThink: Evaluating first-person perspective thinking capability of vision-language models","author":"Cheng","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b255","series-title":"Simultaneous localization and affordance prediction for tasks in egocentric video","author":"Chavis","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b256","series-title":"SMART-LLM: Smart multi-agent robot task planning using large language models","author":"Kannan","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b257","series-title":"Roco: Dialectic multi-robot collaboration with large language models","author":"Mandi","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b258","series-title":"AutoRT: Embodied foundation models for large scale orchestration of robotic agents","author":"Ahn","year":"2024"},{"key":"10.1016\/j.rcim.2025.103064_b259","series-title":"ImageBind: One embedding space to bind them all","author":"Girdhar","year":"2023"},{"key":"10.1016\/j.rcim.2025.103064_b260","series-title":"Meta-transformer: A unified framework for multimodal learning","author":"Zhang","year":"2023"}],"container-title":["Robotics and Computer-Integrated Manufacturing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0736584525001188?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0736584525001188?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T04:55:33Z","timestamp":1757912133000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0736584525001188"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":260,"alternative-id":["S0736584525001188"],"URL":"https:\/\/doi.org\/10.1016\/j.rcim.2025.103064","relation":{},"ISSN":["0736-5845"],"issn-type":[{"value":"0736-5845","type":"print"}],"subject":[],"published":{"date-parts":[[2026,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Empowering natural human\u2013robot collaboration through multimodal language models and spatial intelligence: Pathways and perspectives","name":"articletitle","label":"Article Title"},{"value":"Robotics and Computer-Integrated Manufacturing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.rcim.2025.103064","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2025 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103064"}}