{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T05:05:08Z","timestamp":1764997508424,"version":"3.28.0"},"reference-count":59,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,10,23]],"date-time":"2022-10-23T00:00:00Z","timestamp":1666483200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,10,23]],"date-time":"2022-10-23T00:00:00Z","timestamp":1666483200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,10,23]]},"DOI":"10.1109\/iros47612.2022.9982241","type":"proceedings-article","created":{"date-parts":[[2022,12,26]],"date-time":"2022-12-26T19:38:15Z","timestamp":1672083495000},"page":"1000-1007","source":"Crossref","is-referenced-by-count":9,"title":["COMPASS: Contrastive Multimodal Pretraining for Autonomous Systems"],"prefix":"10.1109","author":[{"given":"Shuang","family":"Ma","sequence":"first","affiliation":[{"name":"Microsoft Redmond,WA"}]},{"given":"Sai","family":"Vemprala","sequence":"additional","affiliation":[{"name":"Microsoft Redmond,WA"}]},{"given":"Wenshan","family":"Wang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University Pittsburgh,PA"}]},{"given":"Jayesh K.","family":"Gupta","sequence":"additional","affiliation":[{"name":"Microsoft Redmond,WA"}]},{"given":"Yale","family":"Song","sequence":"additional","affiliation":[{"name":"Microsoft Redmond,WA"}]},{"given":"Daniel","family":"McDuff","sequence":"additional","affiliation":[{"name":"Microsoft Redmond,WA"}]},{"given":"Ashish","family":"Kapoor","sequence":"additional","affiliation":[{"name":"Microsoft 
Redmond,WA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341801"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.neuro.20.1.303"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.2991\/978-94-6239-133-8_25"},{"key":"ref4","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"arXiv preprint"},{"key":"ref6","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"International conference on machine learning","author":"Chen"},{"key":"ref7","article-title":"Bootstrap your own latent: A new approach to self-supervised learning","author":"Grill","year":"2020","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00769"},{"key":"ref9","article-title":"Learning video representations using contrastive bidirectional transformer","author":"Sun","year":"2019","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"ref11","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv preprint"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793994"},{"key":"ref13","article-title":"Learning physical graph representations from visual scenes","author":"Bear","year":"2020","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"Object-centric learning with slot attention","author":"Locatello","year":"2020","journal-title":"arXiv preprint"},{"key":"ref15","article-title":"Curl: Contrastive unsupervised representations for reinforcement 
learning","author":"Srinivas","year":"2020","journal-title":"arXiv preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.170"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0056301"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2018.07.002"},{"key":"ref22","article-title":"Watching the world go by: Representation learning from unlabeled videos","author":"Gordon","year":"2020","journal-title":"arXiv preprint"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.29007\/9qvn"},{"key":"ref24","article-title":"Self-supervised multimodal versatile networks","author":"Alayrac","year":"2020","journal-title":"arXiv preprint"},{"key":"ref25","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"Lu","year":"2019","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"Vinvl: Making visual representations matter in vision-language models","author":"Zhang","year":"2021","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref28","article-title":"Omninet: A unified architecture for multi-modal multi-task learning","author":"Pramanik","year":"2019","journal-title":"arXiv preprint"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.54"},{"key":"ref30","article-title":"Embodied multimodal multitask learning","author":"Chaplot","year":"2019","journal-title":"arXiv 
preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2738401"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2020.2987728"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2916887"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8461196"},{"key":"ref36","article-title":"Dense object nets: Learning dense visual object descriptors by and for robotic manipulation","author":"Florence","year":"2018","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793485"},{"key":"ref38","article-title":"Object goal navigation using goal-oriented semantic exploration","volume":"33","author":"Chaplot","year":"2020","journal-title":"Advances in Neural Information Processing Systems."},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2019.XV.047"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/IROS45743.2020.9341049"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00164"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2018.2869640"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1177\/0278364913491297"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00685"},{"key":"ref45","article-title":"Delving deeper into convolutional networks for learning video representations","author":"Ballas","year":"2015","journal-title":"arXiv preprint"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.5220\/0008120604140421"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2015.2463671"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_50"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00136"},{"journal-title":"Tartanvo: A generalizable learning-based 
vo","year":"2020","author":"Wang","key":"ref50"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00931"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2469274"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989236"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/IROS40897.2019.8968515"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8461251"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00212"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00304"},{"key":"ref58","article-title":"Airsim drone racing lab","author":"Madaan","year":"2020","journal-title":"arXiv preprint"},{"key":"ref59","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020","journal-title":"arXiv preprint"}],"event":{"name":"2022 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","start":{"date-parts":[[2022,10,23]]},"location":"Kyoto, Japan","end":{"date-parts":[[2022,10,27]]}},"container-title":["2022 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9981026\/9981028\/09982241.pdf?arnumber=9982241","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,1]],"date-time":"2024-02-01T05:09:24Z","timestamp":1706764164000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9982241\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,23]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/iros47612.2022.9982241","relation":{},"subject":[],"published":{"date-parts":[[2022,10,23]]}}}