{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T16:38:49Z","timestamp":1779295129529,"version":"3.51.4"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128322","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"8284-8290","source":"Crossref","is-referenced-by-count":4,"title":["R+X: Retrieval and Execution from Everyday Human Videos"],"prefix":"10.1109","author":[{"given":"Georgios","family":"Papagiannis","sequence":"first","affiliation":[{"name":"Imperial College London,The Robot Learning Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Norman","family":"Di Palo","sequence":"additional","affiliation":[{"name":"Imperial College London,The Robot Learning Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pietro","family":"Vitiello","sequence":"additional","affiliation":[{"name":"Imperial College London,The Robot Learning Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Edward","family":"Johns","sequence":"additional","affiliation":[{"name":"Imperial College London,The Robot Learning Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Apple vision pro."},{"key":"ref2","volume-title":"Meta. Meta quest 3"},{"key":"ref3","volume-title":"Ray-ban smart glasses","author":"Ray-Ban"},{"key":"ref4","volume-title":"MagicLeap. Magicleap"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.052"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.026"},{"key":"ref7","volume-title":"Bc-z: Zero-shot task generalization with robotic imitation learning","author":"Jang","year":"2022"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/IROS58592.2024.10801982"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.078"},{"key":"ref10","article-title":"Vision-based manipulation from single human video with open-world object graphs","author":"Zhu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.043"},{"key":"ref12","article-title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","author":"Brohan","year":"2023","journal-title":"arXiv e-prints"},{"key":"ref13","first-page":"8469","article-title":"PaLM-e: An embodied multimodal language model","volume-title":"Proceedings of the 40th International Conference on Machine Learning","volume":"202","author":"Driess"},{"key":"ref14","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown","year":"2020"},{"key":"ref15","article-title":"Gemini: A family of highly capable multimodal models","year":"2023","journal-title":"Gemini-Team"},{"key":"ref16","article-title":"Large Language Models as General Pattern Machines","author":"Mirchandani","year":"2023","journal-title":"arXiv e-prints"},{"key":"ref17","article-title":"Training Compute-Optimal Large Language Models","author":"Hoffmann","year":"2022","journal-title":"arXiv e-prints"},{"key":"ref18","year":"2023","journal-title":"Open x-embodiment: Robotic learning datasets and rt-x models"},{"key":"ref19","article-title":"Mimicplay: Long-horizon imitation learning by watching human play","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2024.xx.096"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3130800.3130883"},{"key":"ref22","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","year":"2024","journal-title":"G. Team"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref24","volume-title":"Deep vit features as dense visual descriptors","author":"Amir","year":"2022"},{"key":"ref25","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"ref26","article-title":"GPT-4 Technical Report","year":"2023","journal-title":"arXiv e-prints"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.00938"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.90"},{"key":"ref29","volume-title":"R3m: A universal visual representation for robot manipulation","author":"Nair","year":"2022"},{"key":"ref30","volume-title":"Octo: An open-source generalist robot policy","author":"Team","year":"2024"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.026"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3367329"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128322.pdf?arnumber=11128322","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:15:25Z","timestamp":1756880125000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128322\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128322","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}