{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T16:37:23Z","timestamp":1757608643798,"version":"3.44.0"},"reference-count":44,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004063","name":"Knut and Alice Wallenberg Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100004063","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11127332","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"3654-3661","source":"Crossref","is-referenced-by-count":0,"title":["Feature Extractor or Decision Maker: Rethinking the Role of Visual Encoders in Visuomotor Policies"],"prefix":"10.1109","author":[{"given":"Ruiyu","family":"Wang","sequence":"first","affiliation":[{"name":"KTH Royal Institute of Technology,Division of Robotics, Perception and Learning,Sweden"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zheyu","family":"Zhuang","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology,Division of Robotics, Perception and Learning,Sweden"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shutong","family":"Jin","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology,Division of Robotics, Perception and Learning,Sweden"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nils","family":"Ingelhag","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology,Division of Robotics, Perception and Learning,Sweden"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Danica","family":"Kragic","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology,Division of Robotics, Perception and Learning,Sweden"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Florian T.","family":"Pokorny","sequence":"additional","affiliation":[{"name":"KTH Royal Institute of Technology,Division of Robotics, Perception and Learning,Sweden"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"End-toend training of deep visuomotor policies","author":"Levine","year":"2016","journal-title":"Journal of Machine Learning Research (JMLR)"},{"key":"ref2","article-title":"Scalable deep reinforcement learning for vision-based robotic manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Kalashnikov","year":"2018"},{"key":"ref3","article-title":"Transporter networks: Rearranging the visual world for robotic manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Zeng","year":"2021"},{"key":"ref4","article-title":"Perceiveractor: A multi-task transformer for robotic manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Shridhar","year":"2023"},{"key":"ref5","article-title":"What matters in learning from offline human demonstrations for robot manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Mandlekar","year":"2021"},{"key":"ref6","article-title":"Mimicgen: A data generation system for scalable robot learning using human demonstrations","volume-title":"Conference on Robot Learning (CoRL)","author":"Mandlekar","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_12"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref9","article-title":"R3m: A universal visual representation for robot manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Nair","year":"2022"},{"key":"ref10","article-title":"Masked visual pre-training for motor control","author":"Xiao","year":"2022","journal-title":"arXiv"},{"key":"ref11","article-title":"Vip: Towards universal visual reward and representation via value-implicit pre-training","volume-title":"International Conference on Learning Representations (ICLR)","author":"Ma","year":"2023"},{"key":"ref12","article-title":"The (un)surprising effectiveness of pre-trained vision models for control","volume-title":"International Conference on Machine Learning (ICML)","author":"Parisi","year":"2022"},{"key":"ref13","article-title":"Real-world robot learning with masked visual pre-training","volume-title":"Conference on Robot Learning (CoRL)","author":"Radosavovic","year":"2022"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref16","article-title":"Alvinn: An autonomous land vehicle in a neural network","volume-title":"Conference on Advances in Neural Information Processing Systems (NeurIPS)","author":"Pomerleau","year":"1988"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2023.XIX.026"},{"key":"ref18","article-title":"Full-gradient representation for neural network visualization","volume-title":"Conference on Advances in Neural Information Processing Systems (NeurIPS)","author":"Srinivas","year":"2019"},{"volume-title":"Libero: Benchmarking knowledge transfer for lifelong robot learning","year":"2023","author":"Liu","key":"ref19"},{"key":"ref20","article-title":"On pre-training for visuo-motor control: Revisiting a learning-fromscratch baseline","volume-title":"International Conference on Machine Learning (ICML)","author":"Hansen","year":"2023"},{"key":"ref21","article-title":"Lossless adaptation of pretrained vision models for robotic manipulation","volume-title":"International Conference on Learning Representations (ICLR)","author":"Sharma","year":"2023"},{"key":"ref22","article-title":"Spawnnet: Learning generalizable visuomotor skills from pre-trained networks","author":"Lin","year":"2023","journal-title":"arXiv"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197331"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1405.0312"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01441"},{"key":"ref26","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford","year":"2021"},{"key":"ref27","article-title":"VIOLA: Object-centric imitation learning for visionbased robot manipulation","volume-title":"Conference on Robot Learning (CoRL)","author":"Zhu","year":"2022"},{"key":"ref28","article-title":"Robot learning with sensorimotor pretraining","volume-title":"Conference on Robot Learning (CoRL)","author":"Radosavovic","year":"2023"},{"key":"ref29","article-title":"Masked contrastive representation learning","author":"Yao","year":"2022","journal-title":"Transactions on Pattern Analysis and Machine Intelligence (TPAMI)"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161336"},{"key":"ref31","article-title":"Policy pre-training for autonomous driving via selfsupervised geometric modeling","volume-title":"International Conference on Learning Representations (ICLR)","author":"Wu","year":"2023"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.123"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.032"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/IROS55552.2023.10342201"},{"key":"ref36","article-title":"An unbiased look at datasets for visuo-motor pretraining","volume-title":"Conference on Robot Learning (CoRL)","author":"Dasari","year":"2023"},{"key":"ref37","article-title":"For pre-trained vision models in motor control, not all policy learning methods are created equal","volume-title":"International Conference on Machine Learning (ICML)","author":"Hu","year":"2023"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913495721"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3054912"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref41","article-title":"Implicit behavioral cloning","volume-title":"Conference on Robot Learning (CoRL)","author":"Florence","year":"2021"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"ref43","article-title":"Quest2ros: An app to facilitate teleoperating robots","volume-title":"International Workshop on Virtual, Augmented, and Mixed-Reality for Human-Robot Interactions","author":"Welle","year":"2024"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","start":{"date-parts":[[2025,5,19]]},"location":"Atlanta, GA, USA","end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11127332.pdf?arnumber=11127332","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:04:31Z","timestamp":1756879471000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11127332\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11127332","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}