{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:27:34Z","timestamp":1766068054059,"version":"3.28.0"},"reference-count":74,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1109\/icra57147.2024.10610131","type":"proceedings-article","created":{"date-parts":[[2024,8,8]],"date-time":"2024-08-08T17:51:05Z","timestamp":1723139465000},"page":"17544-17552","source":"Crossref","is-referenced-by-count":4,"title":["Recasting Generic Pretrained Vision Transformers As Object-Centric Scene Encoders For Manipulation Policies"],"prefix":"10.1109","author":[{"given":"Jianing","family":"Qian","sequence":"first","affiliation":[{"name":"University of Pennsylvania,GRASP Lab,Computer and Information Science Department,USA"}]},{"given":"Anastasios","family":"Panagopoulos","sequence":"additional","affiliation":[{"name":"University of Pennsylvania,GRASP Lab,Computer and Information Science Department,USA"}]},{"given":"Dinesh","family":"Jayaraman","sequence":"additional","affiliation":[{"name":"University of Pennsylvania,GRASP Lab,Computer and Information Science Department,USA"}]}],"member":"263","reference":[{"article-title":"Overfeat: Integrated recognition, localization and detection using convolutional networks","year":"2013","author":"Sermanet","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref4","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"International Conference on Machine Learning","author":"Chen"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"article-title":"Bootstrap your own latent: A new approach to self-supervised learning","year":"2020","author":"Grill","key":"ref6"},{"article-title":"Improved baselines with momentum contrastive learning","year":"2020","author":"Chen","key":"ref7"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2020","author":"Dosovitskiy","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"article-title":"Dinov2: Learning robust visual features without supervision","year":"2023","author":"Oquab","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01553"},{"article-title":"Deep vit features as dense visual descriptors","year":"2021","author":"Amir","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_36"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2018.8593986"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197331"},{"article-title":"R3m: A universal visual representation for robot manipulation","year":"2022","author":"Nair","key":"ref16"},{"article-title":"Vip: Towards universal visual reward and representation via value-implicit pre-training","year":"2022","author":"Ma","key":"ref17"},{"article-title":"Liv: Language-image representations and rewards for robotic control","year":"2023","author":"Ma","key":"ref18"},{"key":"ref19","first-page":"416","article-title":"Real-world robot learning with masked visual pre-training","volume-title":"Conference on Robot Learning","author":"Radosavovic"},{"article-title":"Where are we in the search for an artificial visual cortex for embodied intelligence?","volume-title":"Workshop on Reincarnating Reinforcement Learning at ICLR 2023","author":"Majumdar","key":"ref20"},{"article-title":"Attention interpretability across nlp tasks","year":"2019","author":"Vashishth","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.385"},{"key":"ref23","first-page":"5221","article-title":"Generalization and robustness implications in object-centric learning","volume-title":"Proceedings of the 39th International Conference on Machine Learning","volume":"162","author":"Dittadi"},{"article-title":"An investigation into pre-training object-centric representations for reinforcement learning","year":"2023","author":"Yoon","key":"ref24"},{"article-title":"On the binding problem in artificial neural networks","year":"2020","author":"Greff","key":"ref25"},{"article-title":"A perspective on objects and systematic generalization in model-based rl","year":"2019","author":"van Steenkiste","key":"ref26"},{"key":"ref27","first-page":"148","article-title":"Sornet: Spatial object-centric representations for sequential manipulation","volume-title":"Conference on Robot Learning","author":"Yuan"},{"article-title":"Vima: General robot manipulation with multimodal prompts","year":"2022","author":"Jiang","key":"ref28"},{"key":"ref29","first-page":"1199","article-title":"Viola: Object-centric imitation learning for vision-based robot manipulation","volume-title":"Conference on Robot Learning","author":"Zhu"},{"key":"ref30","first-page":"18 973","article-title":"Ego4d: Around the world in 3,000 hours of egocentric video","volume-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Xu"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_44"},{"article-title":"For pre-trained vision models in motor control, not all policy learning methods are created equal","year":"2023","author":"Hu","key":"ref32"},{"article-title":"The colosseum: A benchmark for evaluating generalization for robotic manipulation","year":"2023","author":"Pumacay","key":"ref33"},{"article-title":"Reconstruction bottlenecks in Object-Centric generative models","year":"2020","author":"Engelcke","key":"ref34"},{"article-title":"Inductive biases for object-centric representations in the presence of complex textures","volume-title":"UAI 2022 Workshop on Causal Representation Learning","author":"Papa","key":"ref35"},{"key":"ref36","article-title":"Promising or elusive? unsupervised object segmentation from real-world single images","author":"Yang","year":"2022","journal-title":"NeurIPS"},{"article-title":"Neural block-slot representations","year":"2022","author":"Singh","key":"ref37"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00328"},{"article-title":"Invariant slot attention: Object discovery with slot-centric reference frames","year":"2023","author":"Biza","key":"ref39"},{"key":"ref40","first-page":"3992","article-title":"Segment anything","volume-title":"2023 IEEE\/CVF International Conference on Computer Vision (ICCV)","author":"Kirillov"},{"article-title":"Simple open-vocabulary object detection with vision transformers","year":"2022","author":"Minderer","key":"ref41"},{"author":"Stone","key":"ref42","article-title":"Open-world object manipulation using pre-trained vision-language models"},{"article-title":"Focus: Object-centric world models for robotics manipulation","year":"2023","author":"Ferraro","key":"ref43"},{"article-title":"Learning generalizable manipulation policies with object-centric 3d representations","volume-title":"Conference on Robot Learning","author":"Zhu","key":"ref44"},{"key":"ref45","article-title":"Plug-and-play object-centric representations from \u201dwhat\" and \"where\" foundation models","volume-title":"ICRA","author":"Shi","year":"2024"},{"key":"ref46","article-title":"Object discovery and representation networks","author":"Koppula","year":"2022","journal-title":"ECCV"},{"article-title":"Localizing objects with self-supervised transformers and no labels","volume-title":"British Machine Vision Conference","author":"Sim\u00e9oni","key":"ref47"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01414"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00403"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00305"},{"article-title":"Discovering object masks with transformers for unsupervised semantic segmentation","year":"2022","author":"Van Gansbeke","key":"ref51"},{"article-title":"Ibot: Image bert pre-training with online tokenizer","year":"2021","author":"Zhou","key":"ref52"},{"article-title":"Training data-efficient image transformers distillation through attention","year":"2020","author":"Touvron","key":"ref53"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00084"},{"article-title":"Visualizing and understanding patch interactions in vision transformer","year":"2022","author":"Ma","key":"ref55"},{"volume-title":"Exploring explainability in vision transformers","author":"Gil","key":"ref56"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/s11222-007-9033-z"},{"article-title":"Efficient inference in fully connected crfs with gaussian edge potentials","year":"2011","author":"Kr\u00e4henb\u00fchl","key":"ref58"},{"key":"ref59","first-page":"28 016","article-title":"Transformers generalize deepsets and can be extended to graphs & hypergraphs","volume":"34","author":"Kim","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref60","article-title":"The hungarian method for the assignment problem","volume-title":"Naval Research Logistics (NRL)","volume":"52","author":"Kuhn","year":"1955"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_43"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2974707"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_31"},{"key":"ref64","first-page":"10 347","article-title":"Training data-efficient image transformers amp; distillation through attention","volume-title":"International Conference on Machine Learning","volume":"139","author":"Touvron"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"journal-title":"","key":"ref67"},{"key":"ref68","article-title":"Genesis-v2: Inferring unordered object representations without iterative refinement","author":"Engelcke","year":"2021","journal-title":"Neural Information Processing Systems"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.2307\/2284239"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/BF01908075"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2010.161"},{"article-title":"Genesis: Generative scene inference and sampling with object-centric latent representations","year":"2019","author":"Engelcke","key":"ref72"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"article-title":"Meta-world: A benchmark and evaluation for multi-task and meta reinforcement learning","year":"2019","author":"Yu","key":"ref74"}],"event":{"name":"2024 IEEE International Conference on Robotics and Automation (ICRA)","start":{"date-parts":[[2024,5,13]]},"location":"Yokohama, Japan","end":{"date-parts":[[2024,5,17]]}},"container-title":["2024 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10609961\/10609862\/10610131.pdf?arnumber=10610131","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,10]],"date-time":"2024-08-10T05:18:13Z","timestamp":1723267093000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10610131\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":74,"URL":"https:\/\/doi.org\/10.1109\/icra57147.2024.10610131","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]}}}