{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,5]],"date-time":"2025-12-05T12:30:56Z","timestamp":1764937856314,"version":"3.37.3"},"reference-count":36,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"JSPS KAKENHI","award":["23K28168"],"award-info":[{"award-number":["23K28168"]}]},{"name":"JST Moonshot, in part by NEDO"},{"name":"JSPS Fellows","award":["JP23KJ1917"],"award-info":[{"award-number":["JP23KJ1917"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Robot. Autom. Lett."],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1109\/lra.2025.3539086","type":"journal-article","created":{"date-parts":[[2025,2,5]],"date-time":"2025-02-05T19:16:58Z","timestamp":1738783018000},"page":"3022-3029","source":"Crossref","is-referenced-by-count":1,"title":["Mobile Manipulation Instruction Generation From Multiple Images With Automatic Metric Enhancement"],"prefix":"10.1109","volume":"10","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-7078-457X","authenticated-orcid":false,"given":"Kei","family":"Katsumata","sequence":"first","affiliation":[{"name":"Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1991-9119","authenticated-orcid":false,"given":"Motonari","family":"Kambara","sequence":"additional","affiliation":[{"name":"Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2087-2038","authenticated-orcid":false,"given":"Daichi","family":"Yashima","sequence":"additional","affiliation":[{"name":"Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0354-9070","authenticated-orcid":false,"given":"Ryosuke","family":"Korekata","sequence":"additional","affiliation":[{"name":"Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0261-0510","authenticated-orcid":false,"given":"Komei","family":"Sugiura","sequence":"additional","affiliation":[{"name":"Keio University, Yokohama, Kanagawa, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_10"},{"key":"ref3","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"202","author":"Li","year":"2023"},{"key":"ref4","article-title":"Foundation models in robotics: Applications, challenges, and the future","volume-title":"Challenges Future","author":"Firoozi","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2024.2408593"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2015.08.002"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2019.1663608"},{"key":"ref8","first-page":"8469","article-title":"PaLM-E: An embodied multimodal language model","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Driess","year":"2023"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3352363"},{"key":"ref10","article-title":"DM2RM: Dual-mode multimodal ranking for target objects and receptacles based on open-vocabulary instructions","volume-title":"AR","author":"Korekata","year":"2024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2017.00081"},{"key":"ref12","article-title":"Habitat-Matterport 3D dataset (HM3D): 1000 Large-scale 3D environments for embodied AI","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ramakrishnan","year":"2021"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00544"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3607827.3616839"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00543"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01103"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01303"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.39"},{"key":"ref22","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"article-title":"Set-of-mark prompting unleashes extraordinary visual grounding in GPT-4V","year":"2023","author":"Yang","key":"ref23"},{"key":"ref24","first-page":"1","article-title":"DINOv2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Trans. Mach. Learn. Res. J."},{"key":"ref25","first-page":"34892","article-title":"Visual instruction tuning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Liu","year":"2023"},{"key":"ref26","article-title":"Task success prediction for open-vocabulary manipulation based on multi-level aligned representations","volume-title":"Proc. Annu. Conf. Robot Learn.","author":"Goko","year":"2024"},{"article-title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","year":"2024","author":"Reid","key":"ref27"},{"article-title":"GPT-4 Technical Report","year":"2023","author":"Achiam","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01287"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00668"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1907.11692"},{"key":"ref33","first-page":"19769","article-title":"Segment everything everywhere all at once","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zou","year":"2023"},{"year":"2024","key":"ref34","article-title":"text-embedding-3-large"},{"article-title":"OPT: Open pre-trained transformer language models","year":"2022","author":"Zhang","key":"ref35"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"}],"container-title":["IEEE Robotics and Automation Letters"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/7083369\/10849592\/10873846.pdf?arnumber=10873846","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,20]],"date-time":"2025-02-20T20:39:22Z","timestamp":1740083962000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10873846\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":36,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/lra.2025.3539086","relation":{},"ISSN":["2377-3766","2377-3774"],"issn-type":[{"type":"electronic","value":"2377-3766"},{"type":"electronic","value":"2377-3774"}],"subject":[],"published":{"date-parts":[[2025,3]]}}}