{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T15:47:57Z","timestamp":1768232877432,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":54,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557394","type":"print"},{"value":"9789819557400","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5740-0_3","type":"book-chapter","created":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T12:24:58Z","timestamp":1768220698000},"page":"32-45","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ILMA: Improving Robot Task Execution in\u00a0Complex Environments via\u00a0Item-Level Multi-Agent Interaction"],"prefix":"10.1007","author":[{"given":"Kewei","family":"Chen","sequence":"first","affiliation":[]},{"given":"Yayu","family":"Long","sequence":"additional","affiliation":[]},{"given":"Mingsheng","family":"Shang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"3_CR1","unstructured":"Black, K., et al.: Zero-shot robotic manipulation with pre-trained image-editing diffusion models. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"3_CR2","unstructured":"Brohan, A., et\u00a0al.: Rt-1: Robotics transformer for real-world control at scale. arXiv preprint arXiv:2212.06817 (2022)"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Brunke, L., Zhang, Y., R\u00f6mer, R., Naimer, J., Staykov, N., Zhou, S., Schoellig, A.P.: Semantically safe robot manipulation: from semantic scene understanding to motion safeguards. IEEE Robotics and Automation Letters (2025)","DOI":"10.1109\/LRA.2025.3553046"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Chen, B., Xu, Z., Kirmani, S., Ichter, B., Sadigh, D., Guibas, L., Xia, F.: Spatialvlm: Endowing vision-language models with spatial reasoning capabilities. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14455\u201314465 (2024)","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Chen, P., et\u00a0al.: Alignbot: Aligning vlm-powered customized task planning with user reminders through fine-tuning for household robots. In: 2025 IEEE International Conference on Robotics and Automation (ICRA), pp. 12549\u201312556. IEEE (2025)","DOI":"10.1109\/ICRA55743.2025.11128775"},{"key":"3_CR6","doi-asserted-by":"publisher","first-page":"135062","DOI":"10.52202\/079017-4293","volume":"37","author":"AC Cheng","year":"2024","unstructured":"Cheng, A.C., Yin, H., Fu, Y., Guo, Q., Yang, R., Kautz, J., Wang, X., Liu, S.: Spatialrgpt: grounded spatial reasoning in vision-language models. Adv. Neural. Inf. Process. Syst. 37, 135062\u2013135093 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"10\u201311","key":"3_CR7","doi-asserted-by":"publisher","first-page":"1684","DOI":"10.1177\/02783649241273668","volume":"44","author":"C Chi","year":"2025","unstructured":"Chi, C., et al.: Diffusion policy: visuomotor policy learning via action diffusion. Int. J. Robot. Res. 44(10\u201311), 1684\u20131704 (2025)","journal-title":"Int. J. Robot. Res."},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Dai, A., Chang, A.X., Savva, M., Halber, M., Funkhouser, T., Nie\u00dfner, M.: Scannet: Richly-annotated 3d reconstructions of indoor scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5828\u20135839 (2017)","DOI":"10.1109\/CVPR.2017.261"},{"key":"3_CR9","unstructured":"Ding, P., et\u00a0al.: Humanoid-vla: towards universal humanoid control with visual integration. arXiv preprint arXiv:2502.14795 (2025)"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Ding, P., et al.: Quar-vla: Vision-language-action model for quadruped robots. In: European Conference on Computer Vision, pp. 352\u2013367. Springer (2024)","DOI":"10.1007\/978-3-031-72652-1_21"},{"key":"3_CR11","unstructured":"Driess, D., et\u00a0al.: Palm-e: An embodied multimodal language model. In: International Conference on Machine Learning, pp. 8469\u20138488. PMLR (2023)"},{"key":"3_CR12","first-page":"9156","volume":"36","author":"Y Du","year":"2023","unstructured":"Du, Y., Yang, S., Dai, B., Dai, H., Nachum, O., Tenenbaum, J., Schuurmans, D., Abbeel, P.: Learning universal policies via text-guided video generation. Adv. Neural. Inf. Process. Syst. 36, 9156\u20139172 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR13","unstructured":"Ehtesham, A., Singh, A., Gupta, G.K., Kumar, S.: A survey of agent interoperability protocols: Model context protocol (mcp), agent communication protocol (acp), agent-to-agent protocol (a2a), and agent network protocol (anp) (2025). https:\/\/arxiv.org\/abs\/2505.02279"},{"key":"3_CR14","unstructured":"Fan, C., et\u00a0al.: Interleave-vla: Enhancing robot manipulation with interleaved image-text instructions. In: 1st Workshop on Safely Leveraging Vision-Language Foundation Models in Robotics: Challenges and Opportunities (2025)"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"Frankish, K., Ramsey, W.M.: The Cambridge handbook of artificial intelligence. Cambridge University Press (2014)","DOI":"10.1017\/CBO9781139046855"},{"key":"3_CR16","unstructured":"Guo, T., et al.: Large language model based multi-agents: a survey of progress and challenges. In: IJCAI (2024)"},{"key":"3_CR17","unstructured":"Hu, S., Shen, L., Zhang, Y., Tao, D.: Learning multi-agent communication from graph modeling perspective. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"3_CR18","unstructured":"Huang, W., Wang, C., Zhang, R., Li, Y., Wu, J., Fei-Fei, L.: Voxposer: composable 3d value maps for robotic manipulation with language models. In: Conference on Robot Learning, pp. 540\u2013562. PMLR (2023)"},{"key":"3_CR19","unstructured":"brian ichter, Brohan, A., et al.: Do as i can, not as i say: Grounding language in robotic affordances. In: 6th Annual Conference on Robot Learning (2022). https:\/\/openreview.net\/forum?id=bdHkMjBJG_w"},{"key":"3_CR20","unstructured":"Jaki\u0107, L., Boti\u010dki, I., Kr\u017ei\u0107, A.: Bridging virtual and physical robotics: an ai-driven educational platform using nvidia omniverse isaac sim and jetson orin nano. In: 2025 MIPRO 48th ICT and Electronics Convention, pp. 2161\u20132166. IEEE (2025)"},{"issue":"19","key":"3_CR21","doi-asserted-by":"publisher","first-page":"8868","DOI":"10.3390\/app14198868","volume":"14","author":"H Jeong","year":"2024","unstructured":"Jeong, H., Lee, H., Kim, C., Shin, S.: A survey of robot intelligence with large language models. Appl. Sci. 14(19), 8868 (2024)","journal-title":"Appl. Sci."},{"key":"3_CR22","unstructured":"Jiang, F., Peng, Y., Dong, L., Wang, K., Yang, K., Pan, C., Niyato, D., Dobre, O.A.: Large language model enhanced multi-agent systems for 6g communications. IEEE Wirel. Commun. (2024)"},{"issue":"5","key":"3_CR23","doi-asserted-by":"publisher","first-page":"121","DOI":"10.3390\/computers13050121","volume":"13","author":"SD Khan","year":"2024","unstructured":"Khan, S.D., Othman, K.M.: Indoor scene classification through dual-stream deep learning: a framework for improved scene understanding in robotics. Computers 13(5), 121 (2024)","journal-title":"Computers"},{"issue":"5","key":"3_CR24","doi-asserted-by":"publisher","first-page":"1091","DOI":"10.1007\/s11370-024-00550-5","volume":"17","author":"Y Kim","year":"2024","unstructured":"Kim, Y., Kim, D., Choi, J., Park, J., Oh, N., Park, D.: A survey on integration of large language models with intelligent robots. Intel. Serv. Robot. 17(5), 1091\u20131107 (2024)","journal-title":"Intel. Serv. Robot."},{"key":"3_CR25","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4015\u20134026 (2023)"},{"key":"3_CR26","unstructured":"Li, X., et\u00a0al.: Vision-language foundation models as effective robot imitators. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"3_CR27","first-page":"44776","volume":"36","author":"B Liu","year":"2023","unstructured":"Liu, B., et al.: Libero: benchmarking knowledge transfer for lifelong robot learning. Adv. Neural. Inf. Process. Syst. 36, 44776\u201344791 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR28","unstructured":"Liu, F., Yan, F., Zheng, L., Feng, C., Huang, Y., Ma, L.: Robouniview: Visual-language model with unified view representation for robotic manipulation. arXiv preprint arXiv:2406.18977 (2024)"},{"key":"3_CR29","unstructured":"Liu, G., Jiang, W., Lei, B., Pandey, V., Daniilidis, K., Motee, N.: Beyond uncertainty: Risk-aware active view acquisition for safe robot navigation and 3d scene understanding with fisherrf. arXiv preprint arXiv:2403.11396 (2024)"},{"key":"3_CR30","first-page":"655","volume":"36","author":"A Majumdar","year":"2023","unstructured":"Majumdar, A., et al.: Where are we in the search for an artificial visual cortex for embodied intelligence? Adv. Neural. Inf. Process. Syst. 36, 655\u2013677 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR31","unstructured":"Makoviychuk, V., et\u00a0al.: Isaac gym: High performance gpu based physics simulation for robot learning. In: Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2) (2021)"},{"key":"3_CR32","unstructured":"Mao, Y., Zhong, J., Fang, C., Zheng, J., Tang, R., Zhu, H., Tan, P., Zhou, Z.: Spatiallm: training large language models for structured indoor modeling. In: Advances in Neural Information Processing Systems (2025)"},{"key":"3_CR33","unstructured":"Masterman, T., Besen, S., Sawtell, M., Chao, A.: The landscape of emerging ai agent architectures for reasoning, planning, and tool calling: a survey. arXiv preprint arXiv:2404.11584 (2024)"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"McCormac, J., Handa, A., Davison, A., Leutenegger, S.: Semanticfusion: Dense 3d semantic mapping with convolutional neural networks. In: 2017 IEEE International Conference on Robotics and automation (ICRA), pp. 4628\u20134635. IEEE (2017)","DOI":"10.1109\/ICRA.2017.7989538"},{"issue":"3","key":"3_CR35","doi-asserted-by":"publisher","first-page":"7327","DOI":"10.1109\/LRA.2022.3180108","volume":"7","author":"O Mees","year":"2022","unstructured":"Mees, O., Hermann, L., Rosete-Beas, E., Burgard, W.: Calvin: a benchmark for language-conditioned policy learning for long-horizon robot manipulation tasks. IEEE Robot. Automation Lett. 7(3), 7327\u20137334 (2022)","journal-title":"IEEE Robot. Automation Lett."},{"key":"3_CR36","doi-asserted-by":"crossref","unstructured":"Mueller, A.: Modern robotics: Mechanics, planning, and control, vol.\u00a039. IEEE (2019)","DOI":"10.1109\/MCS.2019.2937265"},{"key":"3_CR37","unstructured":"O\u2019Neill, A., et\u00a0al.: Open x-embodiment: Robotic learning datasets and rt-x models: Open x-embodiment collaboration 0. In: 2024 IEEE International Conference on Robotics and Automation (ICRA), pp. 6892\u20136903. IEEE (2024)"},{"key":"3_CR38","unstructured":"Ravi, N., et\u00a0al.: Sam 2: Segment anything in images and videos. In: The Thirteenth International Conference on Learning Representations (2025)"},{"key":"3_CR39","unstructured":"Reuss, M., Ya\u011fmurlu, \u00d6.E., Wenzel, F., Lioutikov, R.: Multimodal diffusion transformer: Learning versatile behavior from multimodal goals. In: First Workshop on Vision-Language Models for Navigation and Manipulation at ICRA 2024"},{"key":"3_CR40","unstructured":"Rosinol, A., Abate, M., Chang, Y., Carlone, L.: Kimera: an open-source library for real-time metric-semantic localization and mapping. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 1689\u20131696. IEEE (2020)"},{"key":"3_CR41","doi-asserted-by":"crossref","unstructured":"Sermanet, P., et\u00a0al.: Robovqa: Multimodal long-horizon reasoning for robotics. In: 2024 IEEE International Conference on Robotics and Automation (ICRA), pp. 645\u2013652. IEEE (2024)","DOI":"10.1109\/ICRA57147.2024.10610216"},{"key":"3_CR42","doi-asserted-by":"crossref","unstructured":"Tang, Y.F., et al.: Mobile robot oriented large-scale indoor dataset for dynamic scene understanding. In: 2024 IEEE International Conference on Robotics and Automation (ICRA), pp. 613\u2013620. IEEE (2024)","DOI":"10.1109\/ICRA57147.2024.10611489"},{"key":"3_CR43","doi-asserted-by":"crossref","unstructured":"Tao, S., et\u00a0al.: Maniskill3: Gpu parallelized robot simulation and rendering for generalizable embodied ai. In: 7th Robot Learning Workshop: Towards Robots with Human-Level Abilities (2025)","DOI":"10.15607\/RSS.2025.XXI.021"},{"issue":"1","key":"3_CR44","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1016\/j.jai.2024.12.003","volume":"4","author":"J Wang","year":"2025","unstructured":"Wang, J., et al.: Large language models for robotics: opportunities, challenges, and perspectives. J. Automation Intell. 4(1), 52\u201364 (2025)","journal-title":"J. Automation Intell."},{"key":"3_CR45","first-page":"75392","volume":"37","author":"J Wang","year":"2024","unstructured":"Wang, J., et al.: Is a picture worth a thousand words? delving into spatial reasoning for vision language models. Adv. Neural. Inf. Process. Syst. 37, 75392\u201375421 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR46","first-page":"41051","volume":"37","author":"Y Wen","year":"2024","unstructured":"Wen, Y., et al.: Vidman: exploiting implicit dynamics from video diffusion model for effective robot manipulation. Adv. Neural. Inf. Process. Syst. 37, 41051\u201341075 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR47","unstructured":"Wu, H., et al.: Unleashing large-scale video generative pre-training for visual robot manipulation. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"3_CR48","unstructured":"Xu, Z., et\u00a0al.: Mobility vla: multimodal instruction navigation with long-context vlms and topological graphs. In: 8th Annual Conference on Robot Learning (2024)"},{"key":"3_CR49","unstructured":"Yu, T., et al.: Meta-world: a benchmark and evaluation for multi-task and meta reinforcement learning. In: Conference on Robot Learning, pp. 1094\u20131100. PMLR (2020)"},{"key":"3_CR50","first-page":"56619","volume":"37","author":"Y Yue","year":"2024","unstructured":"Yue, Y., et al.: Deer-vla: dynamic inference of multimodal large language models for efficient robot execution. Adv. Neural. Inf. Process. Syst. 37, 56619\u201356643 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR51","doi-asserted-by":"crossref","unstructured":"Zhao, Q., et\u00a0al.: Cot-vla: visual chain-of-thought reasoning for vision-language-action models. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp. 1702\u20131713 (2025)","DOI":"10.1109\/CVPR52734.2025.00166"},{"key":"3_CR52","unstructured":"Zhen, H., et al.: 3d-vla: a 3d vision-language-action generative world model. In: Proceedings of the 41st International Conference on Machine Learning, pp. 61229\u201361245 (2024)"},{"issue":"1","key":"3_CR53","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s10458-023-09633-6","volume":"38","author":"C Zhu","year":"2024","unstructured":"Zhu, C., Dastani, M., Wang, S.: A survey of multi-agent deep reinforcement learning with communication. Auton. Agent. Multi-Agent Syst. 38(1), 4 (2024)","journal-title":"Auton. Agent. Multi-Agent Syst."},{"key":"3_CR54","unstructured":"Zitkovich, B., et\u00a0al.: Rt-2: Vision-language-action models transfer web knowledge to robotic control. In: Conference on Robot Learning, pp. 2165\u20132183. PMLR (2023)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5740-0_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T12:25:51Z","timestamp":1768220751000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5740-0_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557394","9789819557400"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5740-0_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"13 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}