{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T09:59:18Z","timestamp":1777888758951,"version":"3.51.4"},"reference-count":65,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001666","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62476293,62272494"],"award-info":[{"award-number":["62476293,62272494"]}],"id":[{"id":"10.13039\/501100001666","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01354","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"14590-14601","source":"Crossref","is-referenced-by-count":0,"title":["RoBridge: A Hierarchical Architecture Bridging Cognition and Execution for General Robotic Manipulation"],"prefix":"10.1109","author":[{"given":"Kaidong","family":"Zhang","sequence":"first","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rongtao","family":"Xu","sequence":"additional","affiliation":[{"name":"MBZUAI"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pengzhen","family":"Ren","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junfan","family":"Lin","sequence":"additional","affiliation":[{"name":"Peng Cheng Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hefeng","family":"Wu","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liang","family":"Lin","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaodan","family":"Liang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","author":"Agarwal","year":"2022","journal-title":"Legged locomotion in challenging terrains using egocentric vision"},{"key":"ref2","first-page":"6","author":"Ahn","year":"2022","journal-title":"Do as i can, not as i say: Grounding language in robotic affordances"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref4","author":"Black","year":"2023","journal-title":"Zero-shot robotic manipulation with pretrained imageediting diffusion models"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2025.xxi.010"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref7","first-page":"1","author":"Brohan","year":"2023","journal-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref9","author":"Cheang","year":"2024","journal-title":"Gr-2: A generative video-language-action model with web-scale knowledge for robot manipulation"},{"key":"ref10","author":"Cheng","year":"2024","journal-title":"Nod-tamp: Generalizable long-horizon planning with neural object descriptors"},{"key":"ref11","author":"Collaboration","year":"2024","journal-title":"Open x-embodiment: Robotic learning datasets and rt-x models"},{"key":"ref12","first-page":"3","author":"Dalal","year":"2024","journal-title":"Plan-seq-learn: Language model guided rl for solving long horizon robotics tasks"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/icra55743.2025.11128407"},{"key":"ref14","author":"Du","year":"2023","journal-title":"Learning universal policies via text-guided video generation"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2022.3141105"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10611615"},{"key":"ref17","author":"Fu","year":"2024","journal-title":"Mobile aloha: Learning bimanual mobile manipulation with lowcost whole-body teleoperation"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342471"},{"key":"ref19","author":"Gazzaniga","year":"2006","journal-title":"Cognitive neuroscience. the biology of the mind, (2014)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1126\/science.3975635"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9561157"},{"key":"ref22","article-title":"Tenenbaum, and Jiajun Wu","author":"Hsu","year":"2023","journal-title":"What\u2019s left? concept grounding with logic-enhanced foundation models"},{"key":"ref23","author":"Hu","year":"2024","journal-title":"Toward general-purpose robots via foundation models: A survey and meta-analysis"},{"key":"ref24","author":"Huang","year":"2023","journal-title":"Voxposer: Composable 3d value maps for robotic manipulation with language models"},{"key":"ref25","first-page":"1","author":"Huang","year":"2024","journal-title":"Rekep: Spatio-temporal reasoning of relational keypoint constraints for robotic manipulation"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72940-9_13"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.120"},{"key":"ref28","first-page":"1","author":"Jin Kim","year":"2024","journal-title":"Openvla: An opensource vision-language-action model"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref30","first-page":"3","author":"Kuang","year":"2024","journal-title":"Ram: Retrieval-based affordance transfer for generalizable zero-shot robotic manipulation"},{"key":"ref31","author":"Li","year":"2024","journal-title":"Towards generalist robot policies: What matters in building vision-language-action models"},{"key":"ref32","first-page":"1","author":"Liu","year":"2024","journal-title":"Moka: Open-world robotic manipulation through markbased visual prompting"},{"key":"ref33","first-page":"1","author":"Liu","year":"2024","journal-title":"Rdt-1b: a diffusion foundation model for bimanual manipulation"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-019-13239-6"},{"key":"ref36","first-page":"1","volume-title":"Octo: An open-source generalist robot policy","author":"Model Team","year":"2023"},{"key":"ref37","first-page":"1","volume-title":"Gpt-4 technical report","year":"2024"},{"key":"ref38","author":"Akkaya","year":"2019","journal-title":"Solving rubik\u2019s cube with a robot hand"},{"key":"ref39","first-page":"3","author":"Oquab","year":"2024","journal-title":"Dinov2: Learning robust visual features without supervision"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01618"},{"key":"ref41","article-title":"Alvinn: An autonomous land vehicle in a neural network","volume":"1","author":"Pomerleau","year":"1988","journal-title":"Advances in neural information processing systems"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.133"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01117"},{"key":"ref44","author":"Ross","year":"2011","journal-title":"A reduction of imitation learning and structured prediction to no-regret online learning"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196619"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2017.8202133"},{"key":"ref47","first-page":"1","author":"Touvron","year":"2023","journal-title":"Llama: Open and efficient foundation language models"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19818-2_6"},{"key":"ref49","article-title":"All robots in one: A new standard and unified dataset for versatile, general-purpose embodied agents","volume-title":"arXiv preprint","author":"Wang","year":"2024"},{"key":"ref50","author":"Wu","year":"2023","journal-title":"Unleashing large-scale video generative pre-training for visual robot manipulation"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.22191\/buuj\/10\/2\/2"},{"key":"ref52","first-page":"3","author":"Yang","year":"2023","journal-title":"Track anything: Segment anything meets videos"},{"key":"ref53","first-page":"3","author":"Yang","year":"2023","journal-title":"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v"},{"key":"ref54","first-page":"3","author":"Yarats","year":"2021","journal-title":"Mastering visual continuous control: Improved dataaugmented reinforcement learning"},{"key":"ref55","author":"Yu","year":"2021","journal-title":"Meta-world: A benchmark and evaluation for multi-task and meta reinforcement learning"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00752-z"},{"key":"ref57","author":"Yuan","year":"2022","journal-title":"Sornet: Spatial object-centric representations for sequential manipulation"},{"key":"ref58","article-title":"Pivot-r: Primitivedriven waypoint-aware world model for robotic manipulation","author":"Zhang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1714"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8461249"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.016"},{"key":"ref62","author":"Zhou","year":"2025","journal-title":"Dino-wm: World models on pre-trained visual features enable zero-shot planning"},{"key":"ref63","author":"Zhu","year":"2024","journal-title":"Nerf-aug: Data augmentation for robotics with neural radiance fields"},{"key":"ref64","author":"Zhu","year":"2023","journal-title":"Learning generalizable manipulation policies with objectcentric 3d representations"},{"key":"ref65","author":"Zhu","year":"2025","journal-title":"robosuite: A modular simulation framework and benchmark for robot learning"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445861.pdf?arnumber=11445861","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:57:26Z","timestamp":1777611446000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445861\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":65,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01354","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}