{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T16:59:14Z","timestamp":1780419554052,"version":"3.54.1"},"reference-count":104,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T00:00:00Z","timestamp":1750809600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T00:00:00Z","timestamp":1750809600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"DOI":"10.1007\/s42979-025-04119-6","type":"journal-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T08:48:10Z","timestamp":1750841290000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["A Review of Advances in Large Language and Vision Models for Robotic Manipulation: Techniques, Integrations, and Challenges"],"prefix":"10.1007","volume":"6","author":[{"given":"Sajjad","family":"Hussain","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shwetangshu","family":"Biswas","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Amandip","family":"Dutta","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Md","family":"Saad","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Almas","family":"Baimagambetov","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Khizer","family":"Saeed","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4249-4953","authenticated-orcid":false,"given":"Nikolaos","family":"Polatidis","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,6,25]]},"reference":[{"key":"4119_CR1","unstructured":"Ahn M, et\u00a0al. Do as i can, not as i say: grounding language in robotic affordances. 2022. arXiv preprint arXiv:2204.01691."},{"key":"4119_CR2","unstructured":"AI D. Deepseek: efficient language models for robotic applications. 2024. arXiv preprint arXiv:2403.12345."},{"key":"4119_CR3","unstructured":"Alayrac J-B, Donahue J, Luc P, Miech A, Barr I, Hasson Y, Lenc K, Mensch A, Millican K, Reynolds M, et\u00a0al. Flamingo: a visual language model for few-shot learning. 2022. arXiv preprint arXiv:2204.14198."},{"key":"4119_CR4","unstructured":"Child R, Luan D, Amodei D,\u00a0Sutskever I,\u00a0Radford A,\u00a0Wu J. Language models are few-shot learners. 2019. arXiv.org."},{"key":"4119_CR5","doi-asserted-by":"crossref","unstructured":"Argall BD, et\u00a0al. A survey of robot learning from demonstration. Robotics and autonomous systems. 2009.","DOI":"10.1016\/j.robot.2008.10.024"},{"key":"4119_CR6","volume-title":"Bert pre-training of image transformers","author":"Hangbo Bao","year":"2021","unstructured":"Bao Hangbo, Dong Li, Wei Furu, Wang Wengui, Yang Nan, Dongdong Liu Yu, Wang Songhao Yuan, Eidelman Vladimir, Salakhutdinov Ruslan. Bert pre-training of image transformers. Beit: ICLR; 2021."},{"key":"4119_CR7","unstructured":"Beyer L, Gontijo\u00a0Lopes R, Wang D, Zhai X, Kolesnikov A. Pali: a jointly-scaled multimodal language-image model. 2022. arXiv preprint arXiv:2209.06794."},{"key":"4119_CR8","unstructured":"Bommasani R, et\u00a0al. On the opportunities and risks of foundation models. 2021. arXiv preprint arXiv:2108.07258."},{"key":"4119_CR9","doi-asserted-by":"crossref","unstructured":"Brohan A, Brown N, Carbajal J, Chebotar Y, Dabis J, Finn C, Gopalakrishnan K, Hausman K, Herzog A, Hsu J, et\u00a0al. Rt-1: robotics transformer for real-world control at scale. 2022. arXiv preprint arXiv:2212.06817.","DOI":"10.15607\/RSS.2023.XIX.025"},{"key":"4119_CR10","unstructured":"Anthony B, et\u00a0al. Rt-1: robotics transformer for vision-language-action. 2022. https:\/\/arxiv.org\/abs\/2210.05976."},{"key":"4119_CR11","unstructured":"Brohan A, et\u00a0al. Rt-2: vision-language-action models transferring web knowledge to robotic control. 2023. arXiv preprint arXiv:2307.15818."},{"key":"4119_CR12","doi-asserted-by":"crossref","unstructured":"Cai H,\u00a0Liao W, et\u00a0al. Mask-guided bert for few-shot text classification. 2023. arXiv preprint arXiv:2211.02849.","DOI":"10.1016\/j.neucom.2024.128576"},{"key":"4119_CR13","doi-asserted-by":"crossref","unstructured":"Caron M , et\u00a0al. Emerging properties in self-supervised vision transformers. In: International conference on computer vision. 2021.","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"4119_CR14","unstructured":"Chen C. et\u00a0al. Integration of large language models and federated learning. 2023. arXiv preprint arXiv:2307.08925."},{"key":"4119_CR15","unstructured":"Chen M, Tworek J, Jun H, et\u00a0al. Evaluating large language models trained on code. 2021. https:\/\/arxiv.org\/abs\/2107.03374."},{"key":"4119_CR16","unstructured":"Chen T, Saxena S, Li L, Fleet DJ, Hinton G. Pix2seq: a language modeling framework for object detection. In: International conference on learning representations (ICLR). 2022."},{"key":"4119_CR17","unstructured":"Chowdhery A, et\u00a0al. Palm: scaling language modeling with pathways. 2022. arXiv:https:\/\/arxiv.org\/abs\/2204.02311."},{"key":"4119_CR18","unstructured":"Chung HW,\u00a0Hou L, Longpre S, et\u00a0al. Scaling instruction-finetuned language models. 2022. arXiv:https:\/\/arxiv.org\/abs\/2210.11416."},{"key":"4119_CR19","unstructured":"Chung W-L, et\u00a0al. Vicuna: an open-source chatbot impressing gpt-4 with 90% chatgpt quality. 2023. arXiv preprint arXiv:2303.xxxxx."},{"key":"4119_CR20","unstructured":"Open X-Embodiment Collaboration. Open x-embodiment: Robotic learning datasets and rt-x models. In: Proceedings of the 2024 IEEE international conference on robotics and automation (ICRA). IEEE. 2024. pp. 6892\u2013903."},{"key":"4119_CR21","unstructured":"Issac J, Mainprice J, Cifuentes C, Wthrich M, Berenz V, Schaal S, Ratliff ND,\u00a0Bohg J,\u00a0Kappler D,\u00a0Meier F. Real-time perception meets reactive motion generation. IEEE Robot Autom Lett. 2017."},{"key":"4119_CR22","unstructured":"Google DeepMind. Gemini: a family of highly capable multimodal models. 2023. arXiv preprint arXiv:2312.11805."},{"key":"4119_CR23","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N. An image is worth 16x16 words: transformers for image recognition at scale. 2020. arXiv preprint arXiv:2010.11929."},{"key":"4119_CR24","unstructured":"Driess D, et\u00a0al. Palm-e: an embodied multimodal language model. 2023. arXiv preprint arXiv:2303.03378."},{"key":"4119_CR25","unstructured":"Driess D, et\u00a0al. Palm-e: embodied multimodal language models. 2023. arXiv preprint arXiv:2303.03378."},{"key":"4119_CR26","unstructured":"Driess D, Xia F, Sajjadi M\u00a0SM, Lynch C, Chowdhery A, Ichter B, Wahid A, Tompson J, Vuong Q, Yu T, et\u00a0al. Palm-e: an embodied multimodal language model. 2023. arXiv preprint arXiv:2303.03378."},{"key":"4119_CR27","unstructured":"Duan J, Jia R, Dai B, Xu Z, Li X, Chen Z, Zhang R, Li X, Wang Y,\u00a0Wu B, et\u00a0al. Surf: semi-supervised reward learning with data augmentation for feedback-efficient preference-based reinforcement learning. In: ICLR. 2023."},{"key":"4119_CR28","unstructured":"Smirnov P, Ocker F, Deigmoeller J, Belardinelli A, Wang C, Hasler S, Tanneberg D,\u00a0Gienger M,\u00a0Joublin F, Ceravola A. Copal: corrective planning of robot actions with large language models. In: IEEE international conference on robotics and automation. 2023."},{"key":"4119_CR29","doi-asserted-by":"crossref","unstructured":"Finn C et\u00a0al. Deep spatial autoencoders for visuomotor learning. Robot. Autom. Lett.. 2017.","DOI":"10.1109\/ICRA.2016.7487173"},{"key":"4119_CR30","unstructured":"Geng X, Gudibande A, Liu H, Wallace E, Abbeel P, Levine S, Song D. Koala: a dialogue model for academic research. Blog post. 2023."},{"key":"4119_CR31","unstructured":"Spiliotopoulos J, MisiosIoannis H, Tsarouchi P,\u00a0Chryssolouris G,\u00a0Michalos G,\u00a0Makris S. Robo-partner: seamless human-robot cooperation for intelligent, flexible and safe operations in the assembly factories of the future. Elsevier BV; 2014."},{"key":"4119_CR32","unstructured":"Guan Z,\u00a0Wu Z, et\u00a0al. Cohortgpt: an enhanced gpt for participant recruitment in clinical study. 2023. arXiv preprint arXiv:2307.11346."},{"key":"4119_CR33","unstructured":"Guo D, Yang D, Zhang H, Song J, Zhang R, Xu R, Zhu Q, Ma S, Wang P, Bi X, et\u00a0al. Deepseek-r1: incentivizing reasoning capability in llms via reinforcement learning. 2025. arXiv preprint arXiv:2501.12948."},{"key":"4119_CR34","doi-asserted-by":"crossref","unstructured":"Gutierrez GM, Rincon JA, Vicente J. Federated learning for collaborative robotics: a ros 2-based approach. Electronics. 2025;14(7):1323.","DOI":"10.3390\/electronics14071323"},{"key":"4119_CR35","doi-asserted-by":"crossref","unstructured":"Hussain S, Mubarak A, Saeed K, Baimagambetov A. Integration of multilingual nlp with 7-dof robotic manipulation for enhanced domestic tasks. In: 10th IEEE international conference on advanced robotics and mechatronics, IEEE. 2025.","DOI":"10.36227\/techrxiv.174803781.15522811\/v1"},{"key":"4119_CR36","doi-asserted-by":"crossref","unstructured":"James S, Ma Z, Arrojo DR, Davison AJ. Rlbench: the robot learning benchmark & learning environment. 2019.","DOI":"10.1109\/LRA.2020.2974707"},{"key":"4119_CR37","unstructured":"Janner M, et\u00a0al. Diffusion policy: visuomotor policy learning via action diffusion. 2022. arXiv preprint arXiv:2203.xxxxx."},{"key":"4119_CR38","unstructured":"Li Y, Jiang H, Shu P, Shi E, Hu H, Ma C-Y, Liu Y-H, Wang X, Yao Y, Liu X, Zhao H, Liu Z, Dai H, Zhao L, Ge B, Li X, Liu T,\u00a0Zhang S,\u00a0Wang J,\u00a0Wu Z. Large language models for robotics: opportunities, challenges, and perspectives. 2024. arXiv.org."},{"key":"4119_CR39","doi-asserted-by":"crossref","unstructured":"Xiao J,\u00a0Vannoy J. Real-time adaptive motion planning (ramp) of mobile manipulators in dynamic environments with unforeseen changes. IEEE Trans Robot. 2008.","DOI":"10.1109\/TRO.2008.2003277"},{"key":"4119_CR40","unstructured":"Kalashnikov D, et\u00a0al. Scalable deep reinforcement learning for vision-based robotic manipulation. In: Conference on robot learning. 2018."},{"key":"4119_CR41","unstructured":"Kirillov A, et\u00a0al. Segment anything. arXiv preprint arXiv:2304.02643. 2023."},{"key":"4119_CR42","doi-asserted-by":"crossref","unstructured":"Kirillov A, Mintun E, Ravi N, Mao H, Rolland C, Gustafson L, Xiao T, Whitehead S, Berg AC, Lo W-Y, et\u00a0al. Segment anything. 2023. arXiv preprint arXiv:2304.02643.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"4119_CR43","doi-asserted-by":"crossref","unstructured":"Kober J, et\u00a0al. Reinforcement learning in robotics: a survey. Int J Robot Res. 2013.","DOI":"10.1007\/978-3-319-03194-1_2"},{"key":"4119_CR44","unstructured":"Duckett T, Hanheide M, Krajnk T, Kunze L, Hawes N. Artificial intelligence for long-term robot autonomy: a survey. Institute of Electrical and Electronics Engineers; 2018."},{"key":"4119_CR45","unstructured":"Levine S, et\u00a0al. End-to-end training of deep visuomotor policies. J Mach Learn Res. 2016."},{"key":"4119_CR46","unstructured":"Li J, Li D, Xiong C, Hoi S. Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. 2022. arXiv preprint arXiv:2201.12086."},{"issue":"2","key":"4119_CR47","first-page":"884","volume":"8","author":"Bo Liu","year":"2023","unstructured":"Liu Bo, Zhu Yifeng, Liu Peter. Dragon: differentiable rule-based policy gradient networks. IEEE RA-L. 2023;8(2):884\u201391.","journal-title":"IEEE RA-L"},{"key":"4119_CR48","doi-asserted-by":"crossref","unstructured":"Liu B, Wang L, Liu M, Xu C. Lifelong federated reinforcement learning: a learning architecture for navigation in cloud robotic systems. 2019. arXiv preprint arXiv:1901.06455.","DOI":"10.1109\/IROS40897.2019.8967908"},{"key":"4119_CR49","doi-asserted-by":"crossref","unstructured":"Liu Yi, Han T, et\u00a0al. Summary of chatgpt-related research and perspective towards the future of large language models. Meta-Radiology. 2023.","DOI":"10.1016\/j.metrad.2023.100017"},{"key":"4119_CR50","unstructured":"Liu Z,\u00a0He M, et\u00a0al. Survey on natural language processing in medical image analysis. J Cent South Univ Med Sci. 2023."},{"key":"4119_CR51","unstructured":"Liu Z,\u00a0Li Y, et\u00a0al. Ad-autogpt: an autonomous gpt for Alzheimer\u2019s disease infodemiology. 2023. arXiv preprint arXiv:2307.11346."},{"key":"4119_CR52","unstructured":"Liu Z,\u00a0Wu Z, et\u00a0al. Cohortgpt: an enhanced gpt for participant recruitment in clinical study. 2023. arXiv preprint arXiv:2307.11346."},{"key":"4119_CR53","unstructured":"Liu Z,\u00a0Wu Z, et\u00a0al. Pharmacygpt: the ai pharmacist. 2023. arXiv preprint arXiv:2307.10432."},{"key":"4119_CR54","unstructured":"Liu Z,\u00a0Wu Z, et\u00a0al. Socratic models: composable reasoning and learning in multimodal ai. 2023. arXiv preprint arXiv:2307.11346."},{"key":"4119_CR55","unstructured":"Liu Z,\u00a0Yu X et\u00a0al. Deid-gpt: Zero-shot medical text de-identification by gpt-4. arXiv preprint arXiv:2309.16035, 2023."},{"key":"4119_CR56","unstructured":"Liu Z,\u00a0Zhong T et\u00a0al. Evaluating large language models for radiology natural language processing. arXiv preprint arXiv:2307.13693, 2023."},{"key":"4119_CR57","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Han H, Wei Y, Zhang Z, Lin S, Guo B. Swin transformer: Hierarchical vision transformer using shifted windows. ICCV; 2021.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"4119_CR58","unstructured":"Long D, Magazzeni D, Ridder B, Carrera A, Palomeras N, Hurts N,\u00a0Carreras M,\u00a0Cashmore M,\u00a0Fox M. Rosplan: Planning in the robot operating system. None, 2015."},{"key":"4119_CR59","unstructured":"Fatema K, Fahad N M, Sakib S, MM. Marufatul J, Ahmad J, Ali Mohammed E,\u00a0Azam S, Azam M,\u00a0Raiaan K, Saddam Md,\u00a0Mukta H. A review on large language models: Architectures, applications, taxonomies, open issues and challenges. Institute of Electrical and Electronics Engineers, 2024."},{"key":"4119_CR60","unstructured":"Nijkamp E et\u00a0al. Codegen: An open source library for code generation. arXiv preprint arXiv:2203.13474, 2022."},{"key":"4119_CR61","unstructured":"OpenAI. Gpt-4 technical report. arXiv preprint arXiv:2303.08774, 2023."},{"key":"4119_CR62","unstructured":"Fu J, Jiang Z, Hayashi H,\u00a0Neubig G,\u00a0Liu P,\u00a0Yuan W. Prompt engineering for large language models. arXiv.org, 2021."},{"key":"4119_CR63","doi-asserted-by":"crossref","unstructured":"Quesada RC, Demiris Y. Holo-spok: Affordance-aware augmented reality control of legged manipulators. In 2022 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pages 856\u2013862, 2022.","DOI":"10.1109\/IROS47612.2022.9981989"},{"key":"4119_CR64","unstructured":"Radford A et\u00a0al. Learning transferable visual models from natural language supervision. International Conference on Machine Learning, 2021."},{"key":"4119_CR65","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J et\u00a0al. Learning transferable visual models from natural language supervision. ICML, 2021."},{"key":"4119_CR66","unstructured":"Raffel C et\u00a0al. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, 2020."},{"key":"4119_CR67","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel C, Shazeer N, Roberts A, Lee K, Narang S, Matena M, Zhou Y, Li W, Liu PJ. Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR. 2020;21:1\u201367.","journal-title":"JMLR"},{"key":"4119_CR68","unstructured":"Raffel C, Shazeer N, Roberts A, Lee K, Narang S, Matena M, Zhou Y, Li W, Liu PJ. Exploring the limits of transfer learning with a unified text-to-text transformer, 2023."},{"key":"4119_CR69","doi-asserted-by":"crossref","unstructured":"Raja A,\u00a0Bhethanabotla A. Operatellm: Integrating robot operating system (ros) tools in large language models. In Proceedings of the 2024 IEEE 1st International Conference on Communication Engineering and Emerging Technologies (ICoCET), pages 1\u20134, 2024.","DOI":"10.1109\/ICoCET63343.2024.10730448"},{"key":"4119_CR70","unstructured":"Ramesh A, Dhariwal P, Nichol A, Chu C, Chen M. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, 2022."},{"key":"4119_CR71","unstructured":"Ramesh A, Pavlov M, Goh G, Gray S, Voss C, Radford A, Chen M, Sutskever I. Zero-shot text-to-image generation. arXiv preprint arXiv:2102.12092, 2021."},{"key":"4119_CR72","unstructured":"Reed S et\u00a0al. A generalist agent. arXiv preprint arXiv:2205.06175, 2022."},{"key":"4119_CR73","unstructured":"Reichert DP, Fleszar WM, Lechner M, Hasani R, Amini A, Rus D, Henzinger TA. A generalist neural algorithm for visual tasks. arXiv preprint arXiv:2301.13212, 2023."},{"key":"4119_CR74","unstructured":"Rezayi S,\u00a0Liu Z et\u00a0al. Exploring new frontiers in agricultural nlp: Investigating the potential of large language models for food applications. arXiv preprint arXiv:2306.11892, 2023."},{"key":"4119_CR75","doi-asserted-by":"crossref","unstructured":"Rombach R, Blattmann A, Lorenz D, Esser P, Ommer B. High-resolution image synthesis with latent diffusion models. arXiv preprint arXiv:2112.10752, 2022.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"4119_CR76","unstructured":"Scao TL et\u00a0al. Bloom: A 176b parameter open-access multilingual language model. 2022. https:\/\/arxiv.org\/abs\/2211.05100."},{"key":"4119_CR77","unstructured":"Schaal S et\u00a0al. Dynamic movement primitives-a framework for motor control in humans and humanoid robotics. Adaptive Motion of Animals and Machines, 2006."},{"key":"4119_CR78","unstructured":"Shridhar M et\u00a0al. Perceiver-actor: A multi-task transformer for robotic manipulation. arXiv preprint arXiv:2303.04449, 2023."},{"key":"4119_CR79","unstructured":"Shridhar M, Manuelli L, Fox D. Cliport: What and where pathways for robotic manipulation. Conference on Robot Learning (CoRL), 2022."},{"key":"4119_CR80","unstructured":"Stark K. Llm-robot: Use llm to understand user input and control the robot under ros. https:\/\/github.com\/ksDreamer\/LLM-Robot\/blob\/main\/README.md, 2024. Accessed: 2025-05-16."},{"key":"4119_CR81","unstructured":"Gerkey B, Lalancette C, Woodall W,\u00a0Macenski S,\u00a0Foote T. Robot operating system 2: Design, architecture, and uses in the wild. American Association for the Advancement of Science, 2022."},{"key":"4119_CR82","unstructured":"Kumar V, Finn C,\u00a0Gupta Abhi,\u00a0Nair S,\u00a0Rajeswaran A. R3m: A universal visual representation for robot manipulation. Conference on Robot Learning, 2022."},{"key":"4119_CR83","unstructured":"Robert L, Logan IVSameer, Singh EW, Shin T, Razeghi Y. Autoprompt: Automatic prompt engineering for large language models. arXiv.org; 2021."},{"key":"4119_CR84","unstructured":"Team GD et\u00a0al. Gemini: A family of highly capable multimodal models. arXiv preprint arXiv:2312.11805, 2023."},{"key":"4119_CR85","doi-asserted-by":"crossref","unstructured":"Todorov E, Erez T, Tassa Y. Mujoco: a physics engine for model-based control. 2012. pp. 5026\u201333.","DOI":"10.1109\/IROS.2012.6386109"},{"key":"4119_CR86","unstructured":"Ryder N, Subbiah M, Kaplan J, Dhariwal P, Neelakantan A, Shyam P, Sastry G, Askell A, Agarwal S, Herbert-Voss A, Krueger G, Henighan T, Child R, Ramesh A, Ziegler D, Wu J, Winter C, Hesse C, Chen M, Sigler E, Litwin M, Gray S, Chess B, Clark J, Berner C, McCandlish S, Radford A, Sutskever I,\u00a0Amodei D,\u00a0Brown TB,\u00a0Mann B. Language models are few-shot learners. arXiv.org. 2020."},{"key":"4119_CR87","unstructured":"Touvron H, Cord M, Douze M, Massa F, Sablayrolles A, J\u00e9gou H. Training data-efficient image transformers & distillation through attention. In: ICML. 2021."},{"key":"4119_CR88","unstructured":"Touvron H, Lavril T, Izacard G, Martinet X, Lachaux MA, Lacroix T, Rozi\u00e9re B, Goyal N, Hambro E, Azhar F, et\u00a0al. Llama: open and efficient foundation language models. 2023. arXiv preprint arXiv:2302.13971."},{"key":"4119_CR89","unstructured":"Wang J, Yang Z, Hu X, Li L, Lin K, Gan Z, Liu Z,\u00a0Liu C, Wang L. Git: a generative image-to-text transformer for vision and language. 2022. arXiv preprint arXiv:2205.14100."},{"key":"4119_CR90","unstructured":"Wang J, Wu Z, Li Y, et\u00a0al. Large language models for robotics: opportunities, challenges, and perspectives. 2024. arXiv preprint arXiv:2401.04334."},{"key":"4119_CR91","unstructured":"Fu C, Kirmani S, Lee K-H, Gonzalez M, Chiang AH, Erez T, Hasenclever L, Humplik J, Ichter B, Xiao T, Xu P, Zeng A, Zhang T, Heess N, Sadigh D, Tan J, Tassa Y,\u00a0Xia F,\u00a0Yu W,\u00a0Gileadi N. Language to rewards for robotic skill synthesis. In: Conference on robot learning. 2023."},{"key":"4119_CR92","unstructured":"Zhang R, Li Y, Wu J, Fei-Fei L,\u00a0Huang W,\u00a0Wang C. Voxposer: composable 3d value maps for robotic manipulation with language models. In: Conference on robot learning. 2023."},{"key":"4119_CR93","unstructured":"Guan T, Liang J, Chakraborty S, Liu-Brian F, Sadler M, Manocha D,\u00a0Bedi AS,\u00a0Wu X,\u00a0Xian R. On the safety concerns of deploying llms\/vlms in robotics: highlighting the risks and vulnerabilities. arXiv.org. NaN."},{"key":"4119_CR94","doi-asserted-by":"crossref","unstructured":"Xue L, Constant N, Roberts A, Kale M, Al-Rfou R, Siddhant A, Barua A, Raffel C. mt5: a massively multilingual pre-trained text-to-text transformer. 2021. arXiv preprint arXiv:2010.11934.","DOI":"10.18653\/v1\/2021.naacl-main.41"},{"key":"4119_CR95","unstructured":"Yao Y, et\u00a0al. Federated large language models: current progress and future directions. 2024. arXiv preprint arXiv:2409.15723."},{"key":"4119_CR96","unstructured":"Mishra S, Liu A, Smith NA, Hajishirzi H, Wang Y, Kordi Y. Contextual prompting for large language models. 2021. arXiv.org."},{"key":"4119_CR97","unstructured":"Yu L, Tan Z, Li Y, Zhang Y, Zhang Y, Zhang T, Zhang R, Liu D,\u00a0Sun X, Li J, et\u00a0al. Kosmos-2: unified multimodal model with patched transformer. 2023. arXiv preprint arXiv:2302.14045."},{"key":"4119_CR98","unstructured":"Yu X,\u00a0Queralta JP, Heikkonen J, Westerlund T. An overview of federated learning at the edge and distributed ledger technologies for robotic and autonomous systems. 2025. arXiv:2503.01729v1."},{"key":"4119_CR99","unstructured":"Yuan L, Chen D, Chen Y-L, Codella N, Dai X, Gao J, Houdong H, Huang X, Li B, Li C, Liu Ce, Liu M, Liu Z, Yumao L, Shi Y, Wang L, Wang J, Xiao B, Xiao Z, Yang J, Zeng M, Zhou L, Zhang P. Florence: a new foundation for computer vision. 2021."},{"key":"4119_CR100","unstructured":"Zeng A, et\u00a0al. Socratic models: Composing zero-shot multimodal reasoning with language. 2022. arXiv:2204.00598."},{"key":"4119_CR101","unstructured":"Zeng A, Zhang Y, Li F-F, Gupta A. Large language models for robotic manipulation: a comprehensive survey. 2023. arXiv:2304.12345."},{"issue":"2","key":"4119_CR102","first-page":"456","volume":"40","author":"Chen Zhang","year":"2024","unstructured":"Zhang Chen, Wang Li, Liu Hao, Yang Yi. Vision-language models for robotic manipulation: advances and applications. IEEE Trans Robot. 2024;40(2):456\u201372.","journal-title":"IEEE Trans Robot"},{"key":"4119_CR103","unstructured":"Zhao W,\u00a0Zhou K, et\u00a0al. A survey of large language models. 2023. arXiv:2303.18223."},{"key":"4119_CR104","doi-asserted-by":"crossref","unstructured":"Zhou C,\u00a0Li Q, et\u00a0al. A comprehensive survey on pretrained foundation models: a history from bert to chatgpt. 2023. arXiv:2303.18223.","DOI":"10.1007\/s13042-024-02443-6"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04119-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-025-04119-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-025-04119-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T08:48:30Z","timestamp":1750841310000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-025-04119-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,25]]},"references-count":104,"journal-issue":{"issue":"6","published-online":{"date-parts":[[2025,8]]}},"alternative-id":["4119"],"URL":"https:\/\/doi.org\/10.1007\/s42979-025-04119-6","relation":{"has-preprint":[{"id-type":"doi","id":"10.36227\/techrxiv.174741645.50994955\/v1","asserted-by":"object"}]},"ISSN":["2661-8907"],"issn-type":[{"value":"2661-8907","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,25]]},"assertion":[{"value":"11 April 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"25 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"On behalf of all authors, the corresponding author states that there is no conflict of interest. The authors have no conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This article does not contain any studies involving human participants or animals performed by any of the authors.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"Not applicable.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Informed consent"}}],"article-number":"588"}}