{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:30:20Z","timestamp":1766068220200,"version":"3.44.0"},"reference-count":51,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000181","name":"AFOSR","doi-asserted-by":"publisher","award":["FA9550-22-1-0273"],"award-info":[{"award-number":["FA9550-22-1-0273"]}],"id":[{"id":"10.13039\/100000181","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["IIS-2246811"],"award-info":[{"award-number":["IIS-2246811"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11128156","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"9507-9515","source":"Crossref","is-referenced-by-count":3,"title":["KALIE: Fine-Tuning Vision-Language Models for Open-World Manipulation Without Robot Data"],"prefix":"10.1109","author":[{"given":"Grace","family":"Tang","sequence":"first","affiliation":[{"name":"University of California,Berkeley"}]},{"given":"Swetha","family":"Rajkumar","sequence":"additional","affiliation":[{"name":"University of California,Berkeley"}]},{"given":"Yifei","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of California,Berkeley"}]},{"given":"Homer Rich","family":"Walke","sequence":"additional","affiliation":[{"name":"University of California,Berkeley"}]},{"given":"Sergey","family":"Levine","sequence":"additional","affiliation":[{"name":"University of California,Berkeley"}]},{"given":"Kuan","family":"Fang","sequence":"additional","affiliation":[{"name":"Cornell University,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Solving rubik\u2019s cube with a robot hand","author":"Akkaya","year":"2019","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.025"},{"journal-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","year":"2023","author":"Brohan","key":"ref3"},{"journal-title":"Fireact: Toward language agent fine-tuning","year":"2023","author":"Chen","key":"ref4"},{"journal-title":"Shikra: Unleashing multimodal llm\u2019s referential dialogue magic","year":"2023","author":"Chen","key":"ref5"},{"journal-title":"Vision-language models provide promptable representations for reinforcement learning","year":"2024","author":"Chen","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.010"},{"volume-title":"Open x-embodiment: Robotic learning datasets and rt-x models","year":"2024","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_42"},{"key":"ref10","article-title":"Palm-e: An embodied multimodal language model","volume-title":"International Conference on Machine Learning","author":"Driess","year":"2023"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.062"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01354"},{"key":"ref13","article-title":"Look before you leap: Unveiling the power of gpt4 v in robotic vision-language planning","author":"Hu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"Voxposer: Composable 3 d value maps for robotic manipulation with language models","author":"Huang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2023.3272516"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00465"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2024.xx.120"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.00821"},{"journal-title":"Adam: A method for stochastic optimization","year":"2017","author":"Kingma","key":"ref19"},{"key":"ref20","doi-asserted-by":"crossref","first-page":"3992","DOI":"10.1109\/ICCV51070.2023.00371","article-title":"Segment anything","volume-title":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Kirillov","year":"2023"},{"volume-title":"Image augmentation is all you need: Regularizing deep reinforcement learning from pixels","year":"2020","author":"Kostrikov","key":"ref21"},{"key":"ref22","first-page":"19884","article-title":"Reinforcement learning with augmented data","volume":"33","author":"Laskin","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1177\/0278364917710318"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02064"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref27","article-title":"kpam: Keypoint affordances for category-level robotic manipulation","volume-title":"International Symposium of Robotics Research","author":"Manuelli","year":"2019"},{"key":"ref28","first-page":"1162","article-title":"Active domain randomization","volume-title":"Conference on Robot Learning","author":"Mehta","year":"2020"},{"volume-title":"Octo: An open-source generalist robot policy","year":"2023","author":"Model Team","key":"ref29"},{"journal-title":"Gpt-4 technical report","year":"2024","key":"ref30"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2016.7487517"},{"key":"ref32","first-page":"2549","article-title":"General in-hand object rotation with vision and touch","volume-title":"Conference on Robot Learning","author":"Qi","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref34","article-title":"Lm-nav: Robotic navigation with large pretrained models of language, vision, and action","volume-title":"Conference on Robot Learning","author":"Shah","year":"2022"},{"key":"ref35","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","volume-title":"Conference on Robot Learning","author":"Shridhar","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00147"},{"journal-title":"Gemini: A family of highly capable multimodal models","year":"2024","key":"ref37"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2017.8202133"},{"journal-title":"Bridgedata v2: A dataset for robot learning at scale","year":"2024","author":"Walke","key":"ref39"},{"journal-title":"One-peace: Exploring one general representation model toward unlimited modalities","year":"2023","author":"Wang","key":"ref40"},{"journal-title":"Cogvlm: Visual expert for pretrained language models","year":"2024","author":"Wang","key":"ref41"},{"key":"ref42","article-title":"Robogen: Towards unleashing infinite data for automated robot learning via generative simulation","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00366"},{"key":"ref44","article-title":"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref45","article-title":"Learning interactive real-world simulators","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.027"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.181"},{"journal-title":"Fine-tuning large vision-language models as decision-making agents via reinforcement learning","year":"2024","author":"Zhai","key":"ref48"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01001"},{"journal-title":"Minigpt-4: Enhancing visionlanguage understanding with advanced large language models","year":"2023","author":"Zhu","key":"ref51"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","start":{"date-parts":[[2025,5,19]]},"location":"Atlanta, GA, USA","end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11128156.pdf?arnumber=11128156","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:17:29Z","timestamp":1756880249000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11128156\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":51,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11128156","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}