{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:16:49Z","timestamp":1759331809535,"version":"3.28.0"},"reference-count":18,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,7,8]],"date-time":"2023-07-08T00:00:00Z","timestamp":1688774400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,7,8]],"date-time":"2023-07-08T00:00:00Z","timestamp":1688774400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,7,8]]},"DOI":"10.1109\/icarm58088.2023.10218865","type":"proceedings-article","created":{"date-parts":[[2023,8,25]],"date-time":"2023-08-25T17:17:20Z","timestamp":1692983840000},"page":"203-208","source":"Crossref","is-referenced-by-count":3,"title":["Object-Centric Inference for Language Conditioned Placement: A Foundation Model based Approach"],"prefix":"10.1109","author":[{"given":"Zhixuan","family":"Xu","sequence":"first","affiliation":[{"name":"Zhejiang University,Hangzhou,China"}]},{"given":"Kechun","family":"Xu","sequence":"additional","affiliation":[{"name":"Zhejiang University,Hangzhou,China"}]},{"given":"Rong","family":"Xiong","sequence":"additional","affiliation":[{"name":"Zhejiang University,Hangzhou,China"}]},{"given":"Yue","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University,Hangzhou,China"}]}],"member":"263","reference":[{"key":"ref13","article-title":"Socratic models: Composing zero-shot multimodal reasoning with language","author":"zeng","year":"2022","journal-title":"ar Xiv preprint"},{"key":"ref12","article-title":"Clip-adapter: Better vision-language models with feature adapters","author":"gao","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref15","article-title":"Minedojo: Building open-ended embodied agents with internet-scale knowledge","author":"fan","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref14","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"ahn","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161041"},{"key":"ref10","first-page":"1877","article-title":"Language mod-els are few-shot learners","volume":"33","author":"brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref2","first-page":"726","article-title":"Transporter networks: Rearranging the visual world for robotic manipulation","author":"zeng","year":"0","journal-title":"Conference on Robot Learning"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561391"},{"key":"ref17","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1145\/3503250","article-title":"N erf: Representing scenes as neural radiance fields for view synthesis","volume":"65","author":"mildenhall","year":"2021","journal-title":"Communications of the ACM"},{"journal-title":"Pybullet a python module for physics simulation for games robotics and machine learning","year":"2016","author":"coumans","key":"ref16"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9811809"},{"key":"ref8","first-page":"69","article-title":"Modeling context in referring expressions","author":"yu","year":"0","journal-title":"Computer Vision-ECCV 2016 14th European Conference Amsterdam The Netherlands October 11&#x2013;14 2016 Proceedings Part II 14"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/HUMANOIDS47582.2021.9555802"},{"key":"ref9","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"0","journal-title":"International Conference on Machine Learning"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-71151-1_43"},{"key":"ref3","first-page":"94","article-title":"Learning object placements for relational instructions by hallucinating scene repre-sentations","author":"mees","year":"0","journal-title":"2020 IEEE International Conference on Robotics and Automation (ICRA)"},{"key":"ref6","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","author":"shridhar","year":"0","journal-title":"Conference on Robot Learning"},{"key":"ref5","article-title":"Differentiable parsing and visual grounding of verbal instructions for object placement","author":"zhao","year":"2022","journal-title":"ar Xiv preprint"}],"event":{"name":"2023 International Conference on Advanced Robotics and Mechatronics (ICARM)","start":{"date-parts":[[2023,7,8]]},"location":"Sanya, China","end":{"date-parts":[[2023,7,10]]}},"container-title":["2023 International Conference on Advanced Robotics and Mechatronics (ICARM)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10218726\/10218397\/10218865.pdf?arnumber=10218865","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,18]],"date-time":"2023-09-18T17:40:35Z","timestamp":1695058835000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10218865\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,8]]},"references-count":18,"URL":"https:\/\/doi.org\/10.1109\/icarm58088.2023.10218865","relation":{},"subject":[],"published":{"date-parts":[[2023,7,8]]}}}