{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T19:29:15Z","timestamp":1772825355284,"version":"3.50.1"},"reference-count":44,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Other Transaction Award","award":["HR00112490375"],"award-info":[{"award-number":["HR00112490375"]}]},{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Friction for Accountability in Conversational Transactions (FACT) Program"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Robot. Autom. Lett."],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1109\/lra.2025.3527290","type":"journal-article","created":{"date-parts":[[2025,1,8]],"date-time":"2025-01-08T15:34:06Z","timestamp":1736350446000},"page":"2287-2294","source":"Crossref","is-referenced-by-count":3,"title":["MotIF: Motion Instruction Fine-Tuning"],"prefix":"10.1109","volume":"10","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6548-0071","authenticated-orcid":false,"given":"Minyoung","family":"Hwang","sequence":"first","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6339-3426","authenticated-orcid":false,"given":"Joey","family":"Hejna","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7802-9183","authenticated-orcid":false,"given":"Dorsa","family":"Sadigh","sequence":"additional","affiliation":[{"name":"Google Deepmind, Mountain View, CA, USA"}]},{"given":"Yonatan","family":"Bisk","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Homerobot: Open-vocabulary mobile manipulation","author":"Yenamandra","year":"2023","journal-title":". Int. Conf. Neural Inf. Process. Syst. Competition"},{"key":"ref2","article-title":"Vision-language models as success detectors","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Du","year":"2023"},{"key":"ref3","article-title":"Task success is not enough: Investigating the use of video-language models as behavior critics for catching undesirable agent behaviors","volume-title":"Proc. Conf. Lang. Model.","author":"Guan","year":"2024"},{"key":"ref4","article-title":"Eureka: Human-level reward design via coding large language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yecheng","year":"2024"},{"key":"ref5","article-title":"Reward design with language models","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kwon","year":"2023"},{"key":"ref6","first-page":"13584","article-title":"Language instructed reinforcement learning for human-AI coordination","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Hu","year":"2023"},{"key":"ref7","article-title":"Language to rewards for robotic skill synthesis","volume-title":"Proc. CoRL","author":"Yu","year":"2023"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01535"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.672"},{"key":"ref10","article-title":"RL-VLM-F: Reinforcement learning from vision language foundation model feedback","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang","year":"2024"},{"key":"ref11","first-page":"55681","article-title":"RoboCLIP: One demonstration is enough to learn robot policies","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Sontakke","year":"2023"},{"key":"ref12","article-title":"Vision-language models are zero-shot reward models for reinforcement learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Rocamonde","year":"2024"},{"key":"ref13","article-title":"Vision-language models as a source of rewards","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst. Agent Learn. Open-Endedness Workshop","author":"Baumli","year":"2023"},{"key":"ref14","article-title":"Physically grounded vision-language models for robotic manipulation","volume-title":"Proc. IEEE Int. Conf. Robot. Autom.","author":"Gao","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"ref16","article-title":"Robotic control via embodied chain-of-thought reasoning","volume-title":"Proc. CoRL","author":"Micha","year":"2024"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.049"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01488"},{"key":"ref19","article-title":"RT-Trajectory: Robotic task generalization via hindsight trajectory sketches","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Gu","year":"2024"},{"key":"ref20","first-page":"13610","article-title":"TAP-vid: A benchmark for tracking any point in a video","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Doersch","year":"2022"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00923"},{"key":"ref22","article-title":"Track2act: Predicting point tracks from Internet videos enables diverse zero-shot robot manipulation","volume-title":"Proc. Eur. Conf. Comput. Vis.","author":"Bharadhwaj","year":"2024"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10611409"},{"key":"ref24","article-title":"Any-point trajectory modeling for policy learning","volume-title":"Proc. IEEE Int. Conf. Robot. Autom.","author":"Wen","year":"2023"},{"key":"ref25","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref26","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023"},{"key":"ref27","article-title":"Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-LLMs","author":"Cheng","year":"2024"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref29","article-title":"Chat-Univi: Unified visual representation empowers large language models with image and video understanding","volume-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit.","author":"Peng","year":"2023"},{"key":"ref30","article-title":"Llava-Next: A strong zero-shot video understanding model","author":"Zhang","year":"2024"},{"key":"ref31","article-title":"Robopoint: A vision-language model for spatial affordance prediction for robotics","volume-title":"Proc. CoRL","author":"Yuan","year":"2024"},{"key":"ref32","article-title":"Moka: Open-vocabulary robotic manipulation through mark-based visual prompting","volume-title":"Proc. RSS","author":"Liu","year":"2024"},{"key":"ref33","article-title":"Pivot: Iterative visual prompting elicits actionable knowledge for VLMs","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nasiriany","year":"2024"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1472"},{"key":"ref36","article-title":"Toward grounded social reasoning","volume-title":"Proc. IEEE Int. Conf. Robot. Autom.","author":"Kwon","year":"2023"},{"key":"ref37","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Liu","year":"2023"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref39","article-title":"Mediapipe: A framework for building perception pipelines","author":"Lugaresi","year":"2019"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref41","article-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90 chatGPT quality","author":"Chiang","year":"2023"},{"key":"ref42","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref43","article-title":"Llava-Next: Improved reasoning, OCR, and world knowledge","author":"Liu","year":"2024"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"}],"container-title":["IEEE Robotics and Automation Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/7083369\/10849592\/10833748.pdf?arnumber=10833748","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,19]],"date-time":"2025-11-19T18:47:35Z","timestamp":1763578055000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10833748\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":44,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/lra.2025.3527290","relation":{},"ISSN":["2377-3766","2377-3774"],"issn-type":[{"value":"2377-3766","type":"electronic"},{"value":"2377-3774","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3]]}}}