{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:21:14Z","timestamp":1778080874393,"version":"3.51.4"},"reference-count":44,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,5,29]],"date-time":"2023-05-29T00:00:00Z","timestamp":1685318400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,5,29]],"date-time":"2023-05-29T00:00:00Z","timestamp":1685318400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,5,29]]},"DOI":"10.1109\/icra48891.2023.10161068","type":"proceedings-article","created":{"date-parts":[[2023,7,4]],"date-time":"2023-07-04T17:20:56Z","timestamp":1688491256000},"page":"7287-7294","source":"Crossref","is-referenced-by-count":38,"title":["LATTE: LAnguage Trajectory TransformEr"],"prefix":"10.1109","author":[{"given":"Arthur","family":"Bucker","sequence":"first","affiliation":[{"name":"Technische Universit&#x00E4;t,M&#x00FC;nchen"}]},{"given":"Luis","family":"Figueredo","sequence":"additional","affiliation":[{"name":"Technische Universit&#x00E4;t,M&#x00FC;nchen"}]},{"given":"Sami","family":"Haddadin","sequence":"additional","affiliation":[{"name":"Technische Universit&#x00E4;t,M&#x00FC;nchen"}]},{"given":"Ashish","family":"Kapoor","sequence":"additional","affiliation":[{"name":"Microsoft"}]},{"given":"Shuang","family":"Ma","sequence":"additional","affiliation":[{"name":"Microsoft"}]},{"given":"Sai","family":"Vemprala","sequence":"additional","affiliation":[{"name":"Microsoft"}]},{"given":"Rogerio","family":"Bonatti","sequence":"additional","affiliation":[{"name":"Microsoft"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1177\/0278364920917755"},{"key":"ref35","article-title":"Decision transformer: Reinforcement learning via sequence modeling","volume":"34","author":"chen","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-control-101119-071628"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412190"},{"key":"ref15","article-title":"From language to goals: Inverse reinforcement learning for vision-based instruction following","author":"fu","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-4380-9_35"},{"key":"ref14","article-title":"Language understanding for field and service robots in a priori unknown environments","author":"walter","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref36","article-title":"Offline reinforcement learning as one big sequence modeling problem","volume":"34","author":"janner","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref31","first-page":"394","article-title":"Vision-and-dialog navigation","author":"thomason","year":"0","journal-title":"Conference on Robot Learning"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/0005-1098(89)90002-2"},{"key":"ref33","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511546877"},{"key":"ref32","author":"nguyen","year":"2019","journal-title":"Help Anna! Visual Navigation with Natural Multimodal Assistance via Retrospective Curiosity-Encouraging Imitation Learning"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1177\/0278364915581193"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS47612.2022.9981810"},{"key":"ref17","first-page":"13 139","article-title":"Language-conditioned imitation learning for robot manipulation tasks","volume":"33","author":"stepputtis","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref39","first-page":"879","article-title":"Roboturk: A crowdsourcing platform for robotic skill learning through imitation","author":"mandlekar","year":"0","journal-title":"Conference on Robot Learning"},{"key":"ref16","article-title":"A recurrent vision-and-language bert for navigation. arxiv 2021","author":"hong","year":"0","journal-title":"ArXiv Preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48506.2021.9560745"},{"key":"ref19","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","author":"huang","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref18","article-title":"Zero-shot task adaptation using natural language","author":"goyal","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref23","article-title":"Habitat 2.0: Training home assistants to rearrange their habitat","volume":"34","author":"szot","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref25","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"0","journal-title":"Int Conference on Machine Learning"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1177\/02783649211046285"},{"key":"ref42","article-title":"Large batch optimization for deep learning: Training bert in 76 minutes","author":"you","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref41","article-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"lewis","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref22","article-title":"On the opportunities and risks of foundation models","author":"bommasani","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref44","article-title":"Yolov3: An incremental improvement","author":"redmon","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref21","article-title":"Semantically grounded object matching for robust robotic scene rearrangement","author":"goodwin","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref43","article-title":"Coppeliasim (formerly v-rep): a versatile and scalable robot simulation framework","author":"rohmer","year":"0","journal-title":"Proc of the International Conference on Intelligent Robots and Systems (IROS)"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref27","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"lu","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref29","article-title":"Vl-bert: Pre-training of generic visual-linguistic representations","author":"su","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref8","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"ahn","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref7","first-page":"894","article-title":"Cliport: What and where pathways for robotic manipulation","author":"shridhar","year":"0","journal-title":"Conference on Robot Learning"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2022.XVIII.065"},{"key":"ref4","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref3","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref6","article-title":"Clip on wheels: Zero-shot object navigation as object localization and exploration","author":"gadre","year":"2022","journal-title":"arXiv Prerint"},{"key":"ref5","article-title":"Using deepspeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model","author":"smith","year":"2022","journal-title":"ArXiv Preprint"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"}],"event":{"name":"2023 IEEE International Conference on Robotics and Automation (ICRA)","location":"London, United Kingdom","start":{"date-parts":[[2023,5,29]]},"end":{"date-parts":[[2023,6,2]]}},"container-title":["2023 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10160211\/10160212\/10161068.pdf?arnumber=10161068","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,24]],"date-time":"2023-07-24T17:37:07Z","timestamp":1690220227000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10161068\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,29]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/icra48891.2023.10161068","relation":{},"subject":[],"published":{"date-parts":[[2023,5,29]]}}}