{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T16:38:19Z","timestamp":1757608699899,"version":"3.44.0"},"reference-count":68,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128270","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"6527-6535","source":"Crossref","is-referenced-by-count":0,"title":["Chain-of-Modality: Learning Manipulation Programs from Multimodal Human Videos with Vision-Language-Models"],"prefix":"10.1109","author":[{"given":"Chen","family":"Wang","sequence":"first","affiliation":[{"name":"Google DeepMind"}]},{"given":"Fei","family":"Xia","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]},{"given":"Wenhao","family":"Yu","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]},{"given":"Tingnan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]},{"given":"Ruohan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Stanford University"}]},{"given":"C. Karen","family":"Liu","sequence":"additional","affiliation":[{"name":"Stanford University"}]},{"given":"Li","family":"Fei-Fei","sequence":"additional","affiliation":[{"name":"Stanford University"}]},{"given":"Jie","family":"Tan","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]},{"given":"Jacky","family":"Liang","sequence":"additional","affiliation":[{"name":"Google DeepMind"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022","journal-title":"arXiv preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.026"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01324"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610288"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_20"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2021.xvii.012"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"ref10","article-title":"Mobility vla: Multimodal instruction navigation with long-context vlms and topological graphs","author":"Chiang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref11","first-page":"893","article-title":"Can foundation models perform zero-shot task specification for robot manipulation?","volume-title":"Learning for dynamics and control conference","author":"Cui"},{"article-title":"Scaling egocentric vision: The epic-kitchens dataset","volume-title":"European Conference on Computer Vision (ECCV)","author":"Damen","key":"ref12"},{"key":"ref13","first-page":"1930","article-title":"Model-based inverse reinforcement learning from visual demonstrations","volume-title":"Conference on Robot Learning","author":"Das"},{"key":"ref14","article-title":"Perceptual values from observation","author":"Edwards","year":"2019","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1177\/02783649241281508"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"journal-title":"Rt-trajectory: Robotic task generalization via hindsight trajectory sketches","year":"2023","author":"Gu","key":"ref18"},{"key":"ref19","article-title":"Toward general-purpose robots via foundation models: A survey and meta-analysis","author":"Hu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00876"},{"key":"ref21","first-page":"9118","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","volume-title":"International conference on machine learning","author":"Huang"},{"key":"ref22","article-title":"Rekep: Spatio-temporal reasoning of relational keypoint constraints for robotic manipulation","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref23","article-title":"Inner monologue: Embodied reasoning through planning with language models","author":"Huang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.052"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01594-9"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2430335"},{"key":"ref28","first-page":"55","article-title":"Graph inverse reinforcement learning from diverse videos","volume-title":"Conference on Robot Learning","author":"Kumar"},{"key":"ref29","article-title":"Hake: Human activity knowledge engine","author":"Li","year":"2019","journal-title":"arXiv preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.125"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10131-7"},{"key":"ref33","article-title":"Llm+ p: Empowering large language models with optimal planning proficiency","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462901"},{"key":"ref35","article-title":"Vip: Towards universal visual reward and representation via value-implicit pre-training","author":"Ma","year":"2022","journal-title":"arXiv preprint"},{"key":"ref36","article-title":"Pivot: Iterative visual prompting elicits actionable knowledge for vlms","author":"Nasiriany","year":"2024","journal-title":"arXiv preprint"},{"key":"ref37","article-title":"R+ x: Retrieval and execution from everyday human videos","author":"Papagiannis","year":"2024","journal-title":"arXiv preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00938"},{"key":"ref39","article-title":"Planning with large language models via corrective re-prompting","volume-title":"NeurIPS 2022 Foundation Models for Decision Making Workshop","author":"Raman","year":"2022"},{"key":"ref40","article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","author":"Reid","year":"2024","journal-title":"arXiv preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.399"},{"key":"ref42","article-title":"Reinforcement learning with videos: Combining offline observations with interaction","author":"Schmeckpeper","year":"2020","journal-title":"arXiv preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_42"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2017.xiii.050"},{"key":"ref46","first-page":"492","article-title":"Lm-nav: Robotic navigation with large pre-trained models of language, vision, and action","volume-title":"Conference on robot learning","author":"Shah"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1177\/02783649211046285"},{"key":"ref48","article-title":"Third-person visual imitation learning via decoupled hierarchical controller","volume":"32","author":"Sharma","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref49","article-title":"Videodex: Learning dexterity from internet videos","author":"Shaw","year":"2022","journal-title":"CoRL"},{"key":"ref50","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","volume-title":"Conference on robot learning","author":"Shridhar"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i18.30006"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2020.xvi.024"},{"journal-title":"Rt-sketch: Goal-conditioned imitation learning from hand-drawn sketches","year":"2024","author":"Sundaresan","key":"ref54"},{"key":"ref55","article-title":"Mimicplay: Long-horizon imitation learning by watching human play","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2868668"},{"key":"ref57","article-title":"Describe, explain, plan and select: Interactive planning with large language models enables open-world multi-task agents","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.044"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2024.xx.092"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9636080"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.617"},{"key":"ref62","first-page":"3536","article-title":"Xskill: Cross embodiment skill discovery","volume-title":"Conference on Robot Learning","author":"Xu"},{"issue":"1","key":"ref63","first-page":"1","article-title":"The dawn of lmms: Preliminary explorations with gpt-4v(ision)","volume":"9","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref64","article-title":"Language to rewards for robotic skill synthesis","volume-title":"arXiv preprint","author":"Yu","year":"2023"},{"key":"ref65","first-page":"537","article-title":"Xirl: Cross-embodiment inverse reinforcement learning","volume-title":"Conference on Robot Learning","author":"Zakka"},{"key":"ref66","article-title":"Socratic models: Composing zero-shot multimodal reasoning with language","author":"Zeng","year":"2022","journal-title":"arXiv preprint"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"ref68","article-title":"Vision-based manipulation from single human video with open-world object graphs","author":"Zhu","year":"2024","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","start":{"date-parts":[[2025,5,19]]},"location":"Atlanta, GA, USA","end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128270.pdf?arnumber=11128270","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:35:25Z","timestamp":1756881325000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128270\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":68,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128270","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}