{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T14:13:23Z","timestamp":1773065603561,"version":"3.50.1"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iros60139.2025.11246658","type":"proceedings-article","created":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:54:45Z","timestamp":1764269685000},"page":"21566-21573","source":"Crossref","is-referenced-by-count":1,"title":["ET-Plan-Bench: Embodied Task-level Planning Benchmark Towards Spatial-Temporal Cognition with Foundation Models"],"prefix":"10.1109","author":[{"given":"Lingfeng","family":"Zhang","sequence":"first","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Yuening","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Hongjian","family":"Gu","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Atia","family":"Hamidizadeh","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Zhanguang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Yuecheng","family":"Liu","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Yutong","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"David Gamaliel","family":"Arcos Bravo","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Junyi","family":"Dong","sequence":"additional","affiliation":[{"name":"Huawei Cloud"}]},{"given":"Shunbo","family":"Zhou","sequence":"additional","affiliation":[{"name":"Huawei Cloud"}]},{"given":"Tongtong","family":"Cao","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Xingyue","family":"Quan","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Yuzheng","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Yingxue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]},{"given":"Jianye","family":"Hao","sequence":"additional","affiliation":[{"name":"Huawei Noah&#x2019;s Ark Lab"}]}],"member":"263","reference":[{"key":"ref1","first-page":"3343","article-title":"Egotaskqa: Understanding human tasks in egocentric videos","volume":"35","author":"Jia","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.418"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00886"},{"key":"ref5","article-title":"Watch-and-help: A challenge for social perception and human-ai collaboration","author":"Puig","year":"2020"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01075"},{"key":"ref7","first-page":"80","article-title":"Behavior-1k: A benchmark for embodied ai with 1,000 everyday activities and realistic simulation","volume-title":"Conference on Robot Learning","author":"Li"},{"key":"ref8","first-page":"12 014","article-title":"Handmethat: Human-robot communication in physical and social environments","volume":"35","author":"Wan","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","article-title":"Lota-bench: Benchmarking language-oriented task planners for embodied agents","author":"Choi","year":"2024"},{"key":"ref10","article-title":"Robogen: Towards unleashing infinite data for automated robot learning via generative simulation","author":"Wang","year":"2023"},{"key":"ref11","first-page":"477","article-title":"Behavior: Bench-mark for everyday household activities in virtual, interactive, and ecological environments","volume-title":"Conference on robot learning","author":"Srivastava"},{"key":"ref12","article-title":"Mini-behavior: A procedurally generated bench-mark for long-horizon decision-making in embodied ai","author":"Jin","year":"2023"},{"key":"ref13","article-title":"Embodiedgpt: Vision-language pre-training via embodied chain of thought","volume":"36","author":"Mu","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref14","article-title":"Egoplan-bench: Benchmarking egocentric embodied planning with multimodal large language models","author":"Chen","year":"2023"},{"key":"ref15","first-page":"100 428","article-title":"Embodied agent interface: Bench-marking llms for embodied decision making","volume":"37","author":"Li","year":"2025","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00536"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01355"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"ref19","article-title":"Noisyeqa: Benchmarking embodied question answering against noisy queries","author":"Wu","year":"2024"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02290"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-long.579"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.822"},{"key":"ref23","article-title":"Egoplan-bench2: A benchmark for multimodal large language model planning in real-world scenarios","author":"Qiu","year":"2024"},{"key":"ref24","article-title":"Embodiedeval: Evaluate multimodal llms as embodied agents","author":"Cheng","year":"2025"},{"key":"ref25","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref26","article-title":"Mixtral of experts","author":"Jiang","year":"2024"},{"key":"ref27","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref28","article-title":"Do as i can, not as i say: Grounding language in robotic affordances","author":"Ahn","year":"2022"},{"key":"ref29","first-page":"9118","article-title":"Language models as zero-shot planners: Extracting actionable knowledge for embodied agents","volume-title":"International conference on machine learning","author":"Huang"},{"key":"ref30","article-title":"Large language models as common-sense knowledge for large-scale task planning","volume":"36","author":"Zhao","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref31","article-title":"On the planning abilities of large language models-a critical investigation","volume":"36","author":"Valmeekam","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref32","article-title":"Faith and fate: Limits of transformers on compositionality","volume":"36","author":"Dziri","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref33","article-title":"Ai2-thor: An interactive 3d environment for visual ai","author":"Kolve","year":"2017"},{"key":"ref34","article-title":"Alfworld: Aligning text and embodied environments for interactive learning","author":"Shridhar","year":"2020"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01550"},{"key":"ref36","article-title":"Gpt-4 technical report","author":"Achiam","year":"2023"},{"key":"ref37","first-page":"251","article-title":"Habitat 2.0: Training home assistants to rearrange their habitat","volume":"34","author":"Szot","year":"2021","journal-title":"Advances in neural information processing systems"}],"event":{"name":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","location":"Hangzhou, China","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11245651\/11245652\/11246658.pdf?arnumber=11246658","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T12:36:50Z","timestamp":1766061410000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11246658\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/iros60139.2025.11246658","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}