{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,21]],"date-time":"2026-06-21T03:22:44Z","timestamp":1782012164908,"version":"3.54.5"},"reference-count":60,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"am","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T00:00:00Z","timestamp":1740787200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&#x0026;D Program of China","award":["2022ZD0160301"],"award-info":[{"award-number":["2022ZD0160301"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376031"],"award-info":[{"award-number":["62376031"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CCF-Tencent Rhino-Bird Open Research Fund"},{"name":"NSF","award":["IIS-1943641"],"award-info":[{"award-number":["IIS-1943641"]}]},{"name":"NSF","award":["IIS-1956441"],"award-info":[{"award-number":["IIS-1956441"]}]},{"name":"NSF","award":["CCF-1837129"],"award-info":[{"award-number":["CCF-1837129"]}]},{"name":"SRA from Meta"},{"name":"research gift from Amazon Alexa AI"},{"name":"gift from Relational AI"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1109\/tpami.2024.3511593","type":"journal-article","created":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T19:05:03Z","timestamp":1733425503000},"page":"1894-1907","source":"Crossref","is-referenced-by-count":25,"title":["<b>JARVIS<\/b>-1: Open-World Multi-Task Agents With Memory-Augmented Multimodal Language Models"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8396-3707","authenticated-orcid":false,"given":"Zihao","family":"Wang","sequence":"first","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1195-7276","authenticated-orcid":false,"given":"Shaofei","family":"Cai","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Anji","family":"Liu","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yonggang","family":"Jin","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jinbing","family":"Hou","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9310-6347","authenticated-orcid":false,"given":"Bowei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Haowei","family":"Lin","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3433-8435","authenticated-orcid":false,"given":"Zhaofeng","family":"He","sequence":"additional","affiliation":[{"name":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zilong","family":"Zheng","sequence":"additional","affiliation":[{"name":"Beijing Institute for General Artificial Intelligence (BIGAI), Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8132-5613","authenticated-orcid":false,"given":"Yaodong","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5609-3822","authenticated-orcid":false,"given":"Xiaojian","family":"Ma","sequence":"additional","affiliation":[{"name":"Beijing Institute for General Artificial Intelligence (BIGAI), Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0944-7814","authenticated-orcid":false,"given":"Yitao","family":"Liang","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","first-page":"10","article-title":"State abstractions for lifelong reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Abel"},{"key":"ref2","first-page":"20","article-title":"Policy and value transfer in lifelong reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Abel"},{"key":"ref3","article-title":"Flamingo: A visual language model for few-shot learning","author":"Alayrac","year":"2022"},{"key":"ref4","article-title":"Video pretraining (VPT): Learning to act by watching unlabeled online videos","author":"Baker","year":"2022"},{"key":"ref5","article-title":"FUYU-8B: Introducing our multimodal models","author":"Bavishi","year":"2023"},{"key":"ref6","article-title":"RT-2: Vision-language-action models transfer web knowledge to robotic control","author":"Brohan","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.15607\/rss.2023.xix.025"},{"key":"ref8","article-title":"Do as I can, not as I say: Grounding language in robotic affordances","volume-title":"Proc. 6th Annu. Conf. Robot Learn.","author":"Brohan"},{"key":"ref9","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01320"},{"key":"ref11","article-title":"Groot: Learning to follow instructions by watching gameplay videos","author":"Cai","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_22"},{"key":"ref13","article-title":"Teaching large language models to self-debug","author":"Chen","year":"2023"},{"key":"ref14","article-title":"Collaborating with language models for embodied reasoning","volume-title":"Proc. NeurIPS Found. Models Decis. Mak. Workshop","author":"Dasgupta"},{"key":"ref15","article-title":"Clip4mc: An RL-friendly vision-language model for minecraft","author":"Ding","year":"2023"},{"key":"ref16","article-title":"Building open-ended embodied agents with internet-scale knowledge","volume-title":"Proc. Adv. Neural Inf. Process. Syst. Datasets Benchmarks","author":"Fan"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-naacl.200"},{"key":"ref18","article-title":"The mineRL 2020 competition on sample efficient reinforcement learning using human priors","author":"Guss","year":"2021"},{"key":"ref19","article-title":"Neurips 2019 competition: The minerl competition on sample efficient reinforcement learning using human priors","author":"Guss","year":"2019"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/339"},{"key":"ref21","article-title":"An embodied generalist agent in 3D world","author":"Huang","year":"2023"},{"key":"ref22","first-page":"9118","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Huang"},{"key":"ref23","article-title":"Inner monologue: Embodied reasoning through planning with language models","author":"Huang","year":"2022"},{"key":"ref24","article-title":"Minerl diamond 2021 competition: Overview, results, and lessons learned","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Kanervisto"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.695"},{"key":"ref26","article-title":"Continual pre-training of language models","volume-title":"Proc. 11th Int. Conf. Learn. Representations","author":"Ke"},{"key":"ref27","first-page":"19274","article-title":"Fast inference from transformers via speculative decoding","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Leviathan"},{"key":"ref28","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive NLP tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lewis"},{"key":"ref29","article-title":"Code as policies: Language model programs for embodied control","author":"Liang","year":"2022"},{"key":"ref30","article-title":"Steve-1: A generative model for text-to-behavior in minecraft","author":"Lifshitz","year":"2023"},{"key":"ref31","article-title":"MCU: A task-centric framework for open-ended agent evaluation in minecraft","author":"Lin","year":"2023"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-023-10131-7"},{"key":"ref33","article-title":"Juewu-MC: Playing minecraft with sample-efficient hierarchical reinforcement learning","author":"Lin","year":"2021"},{"key":"ref34","article-title":"LLM+P: Empowering large language models with optimal planning proficiency","author":"Liu","year":"2023"},{"key":"ref35","article-title":"Visual instruction tuning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liu"},{"key":"ref36","article-title":"LLM as a robotic brain: Unifying egocentric memory and control","author":"Mai","year":"2023"},{"key":"ref37","first-page":"38","article-title":"SEIHAI: A sample-efficient hierarchical ai for the minerl competition","volume-title":"Proc. Distrib. Artif. Intell.: 3rd Int. Conf.","author":"Mao"},{"key":"ref38","article-title":"Generation-augmented retrieval for open-domain question answering","author":"Mao","year":"2020"},{"key":"ref39","first-page":"2661","article-title":"Zero-shot task generalization with multi-task deep reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Oh"},{"key":"ref40","article-title":"Gpt-4 technical report","year":"2023"},{"key":"ref41","first-page":"27730","article-title":"Training language models to follow instructions with human feedback","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Ouyang","year":"2022"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"ref43","article-title":"A generalist agent","author":"Reed","year":"2022"},{"key":"ref44","article-title":"Reflexion: An autonomous agent with dynamic memory and self-reflection","author":"Shinn","year":"2023"},{"key":"ref45","article-title":"ProgPrompt: Generating situated robot task plans using large language models","author":"Singh","year":"2022"},{"key":"ref46","article-title":"Adaplanner: Adaptive planning from feedback with language models","author":"Sun","year":"2023"},{"key":"ref47","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref48","article-title":"Voyager: An open-ended embodied agent with large language models","author":"Wang","year":"2023"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.754"},{"key":"ref50","article-title":"Describe, explain, plan and select: Interactive planning with large language models enables open-world multi-task agents","author":"Wang","year":"2023"},{"key":"ref51","first-page":"24824","article-title":"Chain of thought prompting elicits reasoning in large language models","volume-title":"Proc. 36th Conf. Neural Inf. Process. Syst.","author":"Wei"},{"key":"ref52","article-title":"SPRING: GPT-4 out-performs RL algorithms by studying papers and reasoning","author":"Wu","year":"2023"},{"key":"ref53","article-title":"Tree of thoughts: Deliberate problem solving with large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Yao","year":"2024"},{"key":"ref54","article-title":"React: Synergizing reasoning and acting in language models","author":"Yao","year":"2022"},{"key":"ref55","article-title":"PLAN4MC: Skill reinforcement learning and planning for open-world minecraft tasks","author":"Yuan","year":"2023"},{"key":"ref56","article-title":"Socratic models: Composing zero-shot multimodal reasoning with language","author":"Zeng","year":"2022"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29710"},{"key":"ref58","article-title":"Rladapter: Bridging large language models to reinforcement learning in open worlds","author":"Zhang","year":"2023"},{"key":"ref59","article-title":"MMICL: Empowering vision-language model with multi-modal in-context learning","author":"Zhao","year":"2023"},{"key":"ref60","article-title":"Ghost in the minecraft: Generally capable agents for open-world enviroments via large language models with text-based knowledge and memory","author":"Zhu","year":"2023"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/ieeexplore.ieee.org\/ielam\/34\/10873290\/10778628-aam.pdf","content-type":"application\/pdf","content-version":"am","intended-application":"syndication"},{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/10873290\/10778628.pdf?arnumber=10778628","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T19:41:59Z","timestamp":1743795719000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10778628\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3]]},"references-count":60,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3511593","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3]]}}}