{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T03:36:42Z","timestamp":1773805002507,"version":"3.50.1"},"reference-count":53,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100018696","name":"Health","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100018696","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iros60139.2025.11246228","type":"proceedings-article","created":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:54:45Z","timestamp":1764269685000},"page":"20532-20539","source":"Crossref","is-referenced-by-count":1,"title":["RH20T-P: A Primitive-Level Robotic Manipulation Dataset towards Composable Generalization Agents in Real-world Scenarios"],"prefix":"10.1109","author":[{"given":"Zeren","family":"Chen","sequence":"first","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhelun","family":"Shi","sequence":"additional","affiliation":[{"name":"School of Software, Beihang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoya","family":"Lu","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lehan","family":"He","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sucheng","family":"Qian","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Enshen","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Software, Beihang University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenfei","family":"Yin","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jing","family":"Shao","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cewu","family":"Lu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lu","family":"Sheng","sequence":"additional","affiliation":[{"name":"School of Software, Beihang University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref2","article-title":"RT-2: Visionlanguage-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"ref3","article-title":"RT-Trajectory: Robotic task generalization via hindsight trajectory sketches","author":"Gu","year":"2023"},{"key":"ref4","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","volume-title":"conference on Robot Learning","author":"Shridhar"},{"key":"ref5","article-title":"Learning Orbitally Stable Systems for Diagrammatic Teaching","volume-title":"CoRL 2023 Workshop on Learning Effective Abstractions for Planning (LEAP)","author":"Zhi"},{"key":"ref6","article-title":"Video prediction models as rewards for reinforcement learning","author":"Escontrela","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref7","first-page":"1348","article-title":"Actionquantized offline reinforcement learning for robotic skill learning","volume-title":"Conference on Robot Learning (CoRL)","author":"Luo"},{"key":"ref8","article-title":"Modem: Accelerating visual model-based reinforcement learning with demonstrations","author":"Hansen","year":"2022"},{"key":"ref9","article-title":"Language reward modulation for pretraining reinforcement learning","author":"Adeniji","year":"2023"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.deelio-1.10"},{"key":"ref11","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","article-title":"GPT-4 technical report","year":"2023"},{"key":"ref13","article-title":"GPT-4V(ision) System Card","year":"2023"},{"key":"ref14","article-title":"Llama: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref15","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref16","article-title":"PaLM-E: An embodied multimodal language model","author":"Driess","year":"2023"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3590784\/mm1"},{"key":"ref18","article-title":"Octavius: Mitigating task interference in mllms via moe","volume-title":"International Conference on Learning Representations","author":"Chen"},{"key":"ref19","article-title":"From GPT-4 to Gemini and Beyond: Assessing the Landscape of MLLMs on Generalizability, Trustworthiness and Causality through Four Modalities","author":"Lu","year":"2024"},{"key":"ref20","article-title":"Assessment of Multimodal Large Language Models in Alignment with Human Values","author":"Shi","year":"2024"},{"key":"ref21","article-title":"MineDreamer: Learning to Follow Instructions via Chain-of-Imagination for Simulated-World Control","author":"Zhou","year":"2024"},{"key":"ref22","article-title":"WorldSimBench: Towards Video Generation Models as World Simulators","author":"Qin","year":"2024"},{"key":"ref23","first-page":"26650","article-title":"LAMM: Language-Assisted Multi-Modal Instruction-Tuning Dataset, Framework, and Benchmark","volume":"36","author":"Yin","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref24","article-title":"3D-LLM: Injecting the 3d world into large language models","volume":"36","author":"Hong","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref25","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022"},{"key":"ref26","article-title":"PIVOT: Iterative Visual Prompting Elicits Actionable Knowledge for VLMs","author":"Nasiriany","year":"2024"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01710"},{"key":"ref28","article-title":"Look before you leap: Unveiling the power of gpt-4v in robotic vision-language planning","author":"Hu","year":"2023"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/lra.2024.3477090"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.016"},{"key":"ref31","article-title":"Rh20t: A robotic dataset for learning diverse skills in one-shot","author":"Fang","year":"2023"},{"key":"ref32","first-page":"879","article-title":"Roboturk: A crowdsourcing platform for robotic skill learning through imitation","volume-title":"Conference on Robot Learning (CoRL)","author":"Mandlekar"},{"key":"ref33","article-title":"Robonet: Large-scale multi-robot learning","author":"Dasari","year":"2019"},{"key":"ref34","first-page":"906","article-title":"Multiple interactions made easy (mime): Large scale demonstrations data for imitation","volume-title":"Conference on Robot Learning (CoRL)","author":"Sharma"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3295255"},{"key":"ref36","first-page":"991","article-title":"BC-Z: Zero-shot task generalization with robotic imitation learning","volume-title":"conference on Robot Learning (CoRL)","author":"Jang"},{"key":"ref37","first-page":"1303","article-title":"Learning language-conditioned robot behavior from offline data and crowdsourced annotation","volume-title":"Conference on Robot Learning","author":"Nair"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2022.xviii.063"},{"key":"ref39","article-title":"Open xembodiment: Robotic learning datasets and rt-x models","volume-title":"Proceedings of the IEEE International Conference on Robotics and Automation (ICRA)","author":"Padalkar"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2021.xvii.047"},{"key":"ref41","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford"},{"key":"ref42","article-title":"Deformable detr: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00165"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00649"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01509"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02214-4"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1405.0312"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01543"},{"key":"ref50","article-title":"Learning universal policies via text-guided video generation","volume":"36","author":"Du","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.120"},{"key":"ref52","article-title":"Compositional Foundation Models for Hierarchical Planning","author":"Ajay","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01146"}],"event":{"name":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","location":"Hangzhou, China","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11245651\/11245652\/11246228.pdf?arnumber=11246228","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T06:10:21Z","timestamp":1765519821000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11246228\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/iros60139.2025.11246228","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}