{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T20:13:41Z","timestamp":1778184821077,"version":"3.51.4"},"reference-count":70,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project-Mobile Information Networks","doi-asserted-by":"publisher","award":["2024ZD1300700"],"award-info":[{"award-number":["2024ZD1300700"]}],"id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1109\/tcsvt.2026.3651752","type":"journal-article","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:04:02Z","timestamp":1768255442000},"page":"6456-6471","source":"Crossref","is-referenced-by-count":0,"title":["SceneReasoner: Sufficient Embodied Scene Understanding From Limited Perception by Explicit Functional Association"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0229-4543","authenticated-orcid":false,"given":"Xiukun","family":"Liu","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3584-8540","authenticated-orcid":false,"given":"Ning","family":"Lan","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5301-3274","authenticated-orcid":false,"given":"Mingjie","family":"Wei","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7857-0845","authenticated-orcid":false,"given":"Xuemei","family":"Xie","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2179-3292","authenticated-orcid":false,"given":"Guangming","family":"Shi","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"129","article-title":"Embodied understanding of driving scenarios","volume-title":"Proc. Eur. Conf. Comput. Vis.","author":"Yun-Song"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2024.3441495"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2771"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2024.3399746"},{"key":"ref5","first-page":"1585","article-title":"Embodied semantic scene graph generation","volume-title":"Proc. Conf. Robot Learn.","author":"Li"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.cell.2024.11.022"},{"key":"ref7","first-page":"3874","article-title":"Associative memories via predictive coding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Salvatori"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0128840"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.psych.50.1.243"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-naacl.205"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161534"},{"key":"ref12","article-title":"Embodied task planning with large language models","author":"Wu","year":"2023","journal-title":"arXiv:2307.01848"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.073"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3324380"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3401451"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3538860"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3485907"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3531410"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610243"},{"key":"ref20","article-title":"ReMEmbR: Building and reasoning over long-horizon spatio-temporal memory for robot navigation","author":"Anwar","year":"2024","journal-title":"arXiv:2409.13682"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00008"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02091"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3277206"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3215564"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3091581"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2948267"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.findings-emnlp.700"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531854"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401090"},{"key":"ref31","article-title":"Knowledge-aware dual-side attribute-enhanced recommendation","author":"Pang","year":"2024","journal-title":"arXiv:2403.16037"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3460231.3474240"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref34","first-page":"5776","article-title":"MiniLM: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref35","volume-title":"Hugging Face Model Card","year":"2021"},{"key":"ref36","article-title":"DeepSeek-r1: Incentivizing reasoning capability in LLMs via reinforcement learning","author":"Guo","year":"2025","journal-title":"arXiv:2501.12948"},{"key":"ref37","article-title":"The replica dataset: A digital replica of indoor spaces","author":"Straub","year":"2019","journal-title":"arXiv:1906.05797"},{"key":"ref38","article-title":"Habitat synthetic scenes dataset (HSSD-200): An analysis of 3D scene scale and realism tradeoffs for ObjectGoal navigation","author":"Khanna","year":"2023","journal-title":"arXiv:2306.11290"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00943"},{"key":"ref40","first-page":"251","article-title":"Habitat 2.0: Training home assistants to rearrange their habitat","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"34","author":"Szot"},{"key":"ref41","article-title":"Habitat 3.0: A co-habitat for humans, avatars and robots","author":"Puig","year":"2023","journal-title":"arXiv:2310.13724"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73414-4_27"},{"key":"ref43","article-title":"Compile scene graphs with reinforcement learning","author":"Chen","year":"2025","journal-title":"arXiv:2504.13617"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02287"},{"key":"ref45","article-title":"From data to modeling: Fully open-vocabulary scene graph generation","author":"Chen","year":"2025","journal-title":"arXiv:2505.20106"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_11"},{"key":"ref47","article-title":"Retrieval-augmented generation for large language models: A survey","author":"Gao","year":"2023","journal-title":"arXiv:2312.10997"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i25.34855"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00280"},{"key":"ref50","article-title":"Fast and accurate task planning using neuro-symbolic language models and multi-level goal decomposition","author":"Kwon","year":"2024","journal-title":"arXiv:2409.19250"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4224"},{"key":"ref52","article-title":"Youtu-GraphRAG: Vertically unified agents for graph retrieval-augmented complex reasoning","author":"Dong","year":"2025","journal-title":"arXiv:2508. 19855"},{"key":"ref53","article-title":"LLMs-as-judges: A comprehensive survey on LLM-based evaluation methods","author":"Li","year":"2024","journal-title":"arXiv:2412.05579"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3640457.3688075"},{"key":"ref55","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref56","article-title":"GLM-4.5: Agentic, reasoning, and coding (ARC) foundation models","author":"Zeng","year":"2025","journal-title":"arXiv:2508.06471"},{"key":"ref57","article-title":"Kimi k2: Open agentic intelligence","author":"Team","year":"2025","journal-title":"arXiv:2507.20534"},{"key":"ref58","volume-title":"Opencompass: A Universal Evaluation Platform for Foundation Models","year":"2023"},{"key":"ref59","article-title":"Qwen2-VL: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024","journal-title":"arXiv:2409.12191"},{"key":"ref60","article-title":"Qwen-VL: A versatile vision-language model for understanding, localization, text reading, and beyond","author":"Bai","year":"2023","journal-title":"arXiv:2308.12966"},{"key":"ref61","article-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024","journal-title":"arXiv:2412.05271"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref64","article-title":"DeepSeek-V3 technical report","volume-title":"arXiv:2412.19437","author":"Liu","year":"2024"},{"key":"ref65","article-title":"Qwen2.5 technical report","volume-title":"arXiv:2412.15115","author":"Yang","year":"2024"},{"key":"ref66","article-title":"Qwen2 technical report","volume-title":"arXiv:2407.10671","author":"Yang","year":"2024"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1016\/j.xjidi.2022.100133"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-024-58937-4"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3712286"},{"key":"ref70","article-title":"Nav-r1: Reasoning and navigation in embodied scenes","author":"Liu","year":"2025","journal-title":"arXiv:2509.10884"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/76\/11511351\/11339518.pdf?arnumber=11339518","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T19:55:58Z","timestamp":1778183758000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11339518\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":70,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2026.3651752","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5]]}}}