{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T23:22:17Z","timestamp":1780356137422,"version":"3.54.1"},"reference-count":31,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,8,28]],"date-time":"2024-08-28T00:00:00Z","timestamp":1724803200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,8,28]],"date-time":"2024-08-28T00:00:00Z","timestamp":1724803200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004359","name":"Vetenskapsr\u00e5det","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004359","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,8,28]]},"DOI":"10.1109\/case59546.2024.10711845","type":"proceedings-article","created":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T17:40:16Z","timestamp":1729705216000},"page":"21-26","source":"Crossref","is-referenced-by-count":23,"title":["Vision-language model-driven scene understanding and robotic object manipulation"],"prefix":"10.1109","author":[{"given":"Sichao","family":"Liu","sequence":"first","affiliation":[{"name":"KTH,Department of Production Engineering,Sweden"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianjing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Case Western Reserve University,Department of Mechanical and Aerospace Engineering,USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Robert X.","family":"Gao","sequence":"additional","affiliation":[{"name":"Case Western Reserve University,Department of Mechanical and Aerospace Engineering,USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xi","family":"Vincent Wang","sequence":"additional","affiliation":[{"name":"KTH,Department of Production Engineering,Sweden"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lihui","family":"Wang","sequence":"additional","affiliation":[{"name":"KTH,Department of Production Engineering,Sweden"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1115\/1.4050187"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.cirp.2024.03.004"},{"key":"ref3","first-page":"13139","article-title":"Language-conditioned imitation learning for robot manipulation tasks","volume":"33","author":"Stepputtis","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.3389\/frai.2023.1199350"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.cirp.2023.04.013"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-81-322-3972-7_19"},{"key":"ref7","author":"Devlin","year":"2018","journal-title":"Bert: Pre-training of deep bidirectional transformers for language understanding"},{"key":"ref8","author":"Roumeliotis","year":"2023","journal-title":"Llama 2: Early adopters\u2019 utilization of meta\u2019s new open-source pretrained model"},{"key":"ref9","author":"Achiam","year":"2023","journal-title":"Gpt-4 technical report"},{"key":"ref10","author":"Banks","year":"2024","journal-title":"Gemma: Introducing new state-of-the-art open models"},{"key":"ref11","author":"Wei","year":"2022","journal-title":"Emergent abilities of large language models"},{"issue":"2","key":"ref12","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3605943","article-title":"Recent advances in natural language processing via large pre-trained language models: A survey","volume":"56","author":"Min","year":"2023","journal-title":"ACM Computing Surveys"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161317"},{"key":"ref14","author":"Driess","year":"2023","journal-title":"Palm-e: An embodied multimodal language model"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.aei.2024.102371"},{"key":"ref16","first-page":"287","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","volume-title":"Conference on Robot Learning","author":"Brohan"},{"key":"ref17","author":"Jiang","year":"2022","journal-title":"Vima: General robot manipulation with multimodal prompts"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160969"},{"key":"ref19","author":"Chen","year":"2022","journal-title":"Leveraging large language models for robot 3d scene understanding"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.350"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2011.6094596"},{"key":"ref22","first-page":"310","article-title":"Vision-based holistic scene understanding for context-aware human-robot interaction","volume-title":"International Conference of the Italian Association for Artificial Intelligence","author":"De Magistris"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00494"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161085"},{"key":"ref25","author":"Chen","year":"2023","journal-title":"Minigpt-v2: large language model as a unified interface for vision-language multi-task learning"},{"key":"ref26","author":"Dosovitskiy","year":"2020","journal-title":"An image is worth 16x16 words: Transformers for image recognition at scale"},{"key":"ref27","author":"Li","year":"2023","journal-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.rcim.2023.102610"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160591"},{"key":"ref30","author":"Huang","year":"2023","journal-title":"Instruct2act: Mapping multi-modality instructions to robotic actions with large language model"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/access.2024.3387941"}],"event":{"name":"2024 IEEE 20th International Conference on Automation Science and Engineering (CASE)","location":"Bari, Italy","start":{"date-parts":[[2024,8,28]]},"end":{"date-parts":[[2024,9,1]]}},"container-title":["2024 IEEE 20th International Conference on Automation Science and Engineering (CASE)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10711304\/10711288\/10711845.pdf?arnumber=10711845","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T01:20:57Z","timestamp":1732670457000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10711845\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,28]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/case59546.2024.10711845","relation":{},"subject":[],"published":{"date-parts":[[2024,8,28]]}}}