{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T08:17:06Z","timestamp":1781079426413,"version":"3.54.1"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1109\/icra57147.2024.10611112","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T17:51:05Z","timestamp":1723139465000},"page":"2051-2058","source":"Crossref","is-referenced-by-count":36,"title":["Vision-Language Interpreter for Robot Task Planning"],"prefix":"10.1109","author":[{"given":"Keisuke","family":"Shirai","sequence":"first","affiliation":[{"name":"Kyoto University,Kyoto,Japan,606-8501"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cristian C.","family":"Beltran-Hernandez","sequence":"additional","affiliation":[{"name":"OMRON SINIC X Corporation,Tokyo,Japan,113-0033"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Masashi","family":"Hamaya","sequence":"additional","affiliation":[{"name":"OMRON SINIC X Corporation,Tokyo,Japan,113-0033"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Atsushi","family":"Hashimoto","sequence":"additional","affiliation":[{"name":"OMRON SINIC X Corporation,Tokyo,Japan,113-0033"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shohei","family":"Tanaka","sequence":"additional","affiliation":[{"name":"OMRON SINIC X Corporation,Tokyo,Japan,113-0033"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kento","family":"Kawaharazuka","sequence":"additional","affiliation":[{"name":"University of Tokyo,Tokyo,Japan,113-8656"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kazutoshi","family":"Tanaka","sequence":"additional","affiliation":[{"name":"OMRON SINIC X Corporation,Tokyo,Japan,113-0033"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yoshitaka","family":"Ushiku","sequence":"additional","affiliation":[{"name":"OMRON SINIC X Corporation,Tokyo,Japan,113-0033"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shinsuke","family":"Mori","sequence":"additional","affiliation":[{"name":"University of Tokyo,Tokyo,Japan,113-8656"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460699"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-control-101119-071628"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1406.1078"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2017.XIII.056"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8794441"},{"key":"ref8","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref9","article-title":"GPT-4 technical report","year":"2023"},{"key":"ref10","article-title":"PaLM 2 technical report","author":"Anil","year":"2023"},{"key":"ref11","first-page":"9118","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","volume-title":"ser. Proceedings of the 2022 International Conference on Machine Learning (ICML)","volume":"162","author":"Huang"},{"key":"ref12","article-title":"Planning with large language models via corrective re-prompting","volume-title":"NeurIPS 2022 Foundation Models for Decision Making Workshop","author":"Raman"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10131-7"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref15","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proceedings of the 2020 Advances in Neural Information Processing Systems","volume":"33","author":"Brown"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/dsaa.2018.00018"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-control-082619-100135"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-01584-7"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1129"},{"key":"ref20","article-title":"Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection","author":"Liu","year":"2023"},{"key":"ref21","first-page":"19 730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proceedings of the 2023 International Conference on Machine Learning (ICML)","volume":"202","author":"Li"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1705"},{"key":"ref23","article-title":"Do as I can, not as I say: Grounding language in robotic affordances","author":"Ahn","year":"2022"},{"key":"ref24","article-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"ref25","article-title":"LLM+P: Empowering large language models with optimal planning proficiency","author":"Liu","year":"2023"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.120"},{"key":"ref27","article-title":"Translating natural language to planning goals with large-language models","author":"Xie","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/S0004-3702(01)00108-4"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/icaps.v30i1.6739"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2015.7139728"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1177\/02783649211004615"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/IROS51168.2021.9635941"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_51"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_40"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.330"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812016"},{"key":"ref41","first-page":"24 824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proceedings of the 2022 Advances in Neural Information Processing Systems (NeurIPS)","volume":"35","author":"Wei"},{"key":"ref42","first-page":"22 199","article-title":"Large language models are zero-shot reasoners","volume-title":"Proceedings of the 2022 Advances in Neural Information Processing Systems (NeurIPS)","volume":"35","author":"Kojima"},{"key":"ref43","article-title":"Least-to-most prompting enables complex reasoning in large language models","volume-title":"Proceedings of the 2023 International Conference on Learning Representations (ICLR)","author":"Zhou"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1016\/0004-3702(92)90028-V"},{"key":"ref45","first-page":"1629","article-title":"Translating HTNs to PDDL: A small amount of domain knowledge can go a long way","volume-title":"Proceedings of the 2009 International Joint Conference on Artificial Intelligence (IJCAI)","volume":"9","author":"Alford"},{"key":"ref46","article-title":"PDDLGym: Gym environments from PDDL problems","volume-title":"Proceedings of the 2020 International Conference on Automated Planning and Scheduling (ICAPS) PRL Workshop","author":"Silver"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.173"}],"event":{"name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","location":"Yokohama, Japan","start":{"date-parts":[[2024,5,13]]},"end":{"date-parts":[[2024,5,17]]}},"container-title":["2024 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10609961\/10609862\/10611112.pdf?arnumber=10611112","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,11]],"date-time":"2024-08-11T04:14:27Z","timestamp":1723349667000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10611112\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/icra57147.2024.10611112","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]}}}