{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:33:26Z","timestamp":1780356806638,"version":"3.54.1"},"reference-count":18,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,12,12]],"date-time":"2023-12-12T00:00:00Z","timestamp":1702339200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,12]],"date-time":"2023-12-12T00:00:00Z","timestamp":1702339200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,12,12]]},"DOI":"10.1109\/humanoids57100.2023.10375211","type":"proceedings-article","created":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T14:32:51Z","timestamp":1704119571000},"page":"1-8","source":"Crossref","is-referenced-by-count":15,"title":["Robotic Applications of Pre-Trained Vision-Language Models to Various Recognition Behaviors"],"prefix":"10.1109","author":[{"given":"Kento","family":"Kawaharazuka","sequence":"first","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo,Department of Mechano- Informatics,Bunkyo-ku,Tokyo,Japan,113\u20138656"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yoshiki","family":"Obinata","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo,Department of Mechano- Informatics,Bunkyo-ku,Tokyo,Japan,113\u20138656"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Naoaki","family":"Kanazawa","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo,Department of Mechano- Informatics,Bunkyo-ku,Tokyo,Japan,113\u20138656"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kei","family":"Okada","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo,Department of Mechano- Informatics,Bunkyo-ku,Tokyo,Japan,113\u20138656"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Masayuki","family":"Inaba","sequence":"additional","affiliation":[{"name":"Graduate School of Information Science and Technology, The University of Tokyo,Department of Mechano- Informatics,Bunkyo-ku,Tokyo,Japan,113\u20138656"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref2","first-page":"2672","article-title":"Generative Adversarial Nets","volume-title":"Proceedings of the 2014 Neural Information Processing Systems","author":"Goodfellow"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref4","article-title":"Learning Transferable Visual Models From Natural Language Super-vision","author":"Radford","year":"2021","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Frame-work","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00008"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8460699"},{"key":"ref8","first-page":"1","article-title":"CLIPort: What and Where Pathways for Robotic Manipulation","volume-title":"Proceedings of the 2021 Conference on Robot Learning","author":"Shridhar"},{"key":"ref9","article-title":"Open X-Embodiment: Robotic Learning Datasets and RT-X Models","volume-title":"Open X-Embodiment Collaboration","year":"2023"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2018.2801475"},{"key":"ref11","article-title":"Vision-Language Intelligence: Tasks, Representation Learning, and Large Models","author":"Li","year":"2022","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref13","first-page":"1","article-title":"Open-vocabulary Object Detection via Vision and Language Knowledge Distillation","volume-title":"Proceedings of the 10th International Conference on Learning Representations, 2022","author":"Gu"},{"key":"ref14","first-page":"1","article-title":"Language-driven Semantic Segmentation","volume-title":"Proceedings of the 10th International Conference on Learning Representations","author":"Li"},{"key":"ref15","article-title":"Language Models are Few-Shot Learners","author":"Brown","year":"2020","journal-title":"arXiv preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2015.7139369"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1080\/01691864.2022.2114297"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160390"}],"event":{"name":"2023 IEEE-RAS 22nd International Conference on Humanoid Robots (Humanoids)","location":"Austin, TX, USA","start":{"date-parts":[[2023,12,12]]},"end":{"date-parts":[[2023,12,14]]}},"container-title":["2023 IEEE-RAS 22nd International Conference on Humanoid Robots (Humanoids)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10375141\/10374561\/10375211.pdf?arnumber=10375211","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,12]],"date-time":"2024-01-12T20:07:47Z","timestamp":1705090067000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10375211\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,12]]},"references-count":18,"URL":"https:\/\/doi.org\/10.1109\/humanoids57100.2023.10375211","relation":{},"subject":[],"published":{"date-parts":[[2023,12,12]]}}}