{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T17:27:49Z","timestamp":1775064469792,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819635245","type":"print"},{"value":"9789819635252","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T00:00:00Z","timestamp":1742860800000},"content-version":"vor","delay-in-days":83,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"abstract":"<jats:title>Abstract<\/jats:title>\n          <jats:p>Human intention-based systems enable robots to perceive and interpret user actions to interact with humans and adapt to their behavior proactively. Therefore, intention prediction is pivotal in creating a natural interaction with social robots in human-designed environments. In this paper, we examine using Large Language Models (LLMs) to infer human intention in a collaborative object categorization task with a physical robot. We propose a novel multimodal approach that integrates user non-verbal cues, like hand gestures, body poses, and facial expressions, with environment states and user verbal cues to predict user intentions in a hierarchical architecture. Our evaluation of five LLMs shows the potential for reasoning about verbal and non-verbal user cues, leveraging their context-understanding and real-world knowledge to support intention prediction while collaborating on a task with a social robot. <jats:bold>Video:<\/jats:bold>\n            <jats:ext-link xmlns:xlink=\"http:\/\/www.w3.org\/1999\/xlink\" xlink:href=\"https:\/\/youtu.be\/tBJHfAuzohI\" ext-link-type=\"uri\">https:\/\/youtu.be\/tBJHfAuzohI<\/jats:ext-link>\n          <\/jats:p>","DOI":"10.1007\/978-981-96-3525-2_25","type":"book-chapter","created":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T00:02:01Z","timestamp":1742860921000},"page":"292-306","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Comparing Apples to\u00a0Oranges: LLM-Powered Multimodal Intention Prediction in\u00a0an\u00a0Object Categorization Task"],"prefix":"10.1007","author":[{"given":"Hassan","family":"Ali","sequence":"first","affiliation":[]},{"given":"Philipp","family":"Allgeuer","sequence":"additional","affiliation":[]},{"given":"Stefan","family":"Wermter","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,25]]},"reference":[{"issue":"6","key":"25_CR1","doi-asserted-by":"publisher","first-page":"2014","DOI":"10.1007\/s12559-023-10174-z","volume":"15","author":"H Ali","year":"2023","unstructured":"Ali, H., Jirak, D., Wermter, S.: Snapture\u2013a novel neural architecture for combined static and dynamic hand gesture recognition. Cogn. Comput. 15(6), 2014\u20132033 (2023)","journal-title":"Cogn. Comput."},{"key":"25_CR2","unstructured":"AlKhamissi, B., Li, M., Celikyilmaz, A., Diab, M., Ghazvininejad, M.: A review on language models as knowledge bases. 
arXiv:2204.06031 (2022)"},{"key":"25_CR3","doi-asserted-by":"crossref","unstructured":"Allgeuer, P., Ali, H., Wermter, S.: When robots get chatty: grounding multimodal human-robot conversation and collaboration. In: ICANN 2024 (2024)","DOI":"10.1007\/978-3-031-72341-4_21"},{"key":"25_CR4","doi-asserted-by":"publisher","unstructured":"Cherakara, N., Varghese, F., et\u00a0al., S.S.: FurChat: an embodied conversational agent using LLMs, combining open and closed-domain dialogue with facial expressions. In: Proceedings of the 24th Annual Meeting of the Special Interest Group on Discourse and Dialogue, pp. 588\u2013592, September 2023. https:\/\/doi.org\/10.18653\/v1\/2023.sigdial-1.55","DOI":"10.18653\/v1\/2023.sigdial-1.55"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Graule, M.A., Isler, V.: GG-LLM: Geometrically grounding large language models for zero-shot human activity forecasting in human-aware task planning. In: IEEE ICRA (2024)","DOI":"10.1109\/ICRA57147.2024.10611090"},{"key":"25_CR6","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. In: ICLR (2022)"},{"key":"25_CR7","doi-asserted-by":"publisher","unstructured":"Habekost, J.G., Strahl, E., Allgeuer, P., Kerzel, M., Wermter, S.: Cycleik: neuro-inspired inverse kinematics. In: ICANN 2023, pp. 457\u2013470, September 2023. https:\/\/doi.org\/10.1007\/978-3-031-44207-0_38","DOI":"10.1007\/978-3-031-44207-0_38"},{"key":"25_CR8","doi-asserted-by":"publisher","DOI":"10.3389\/fcomp.2021.671012","volume":"3","author":"E Hildt","year":"2021","unstructured":"Hildt, E.: What sort of robots do we want to interact with? reflecting on the human side of human-artificial intelligence interaction. Front. Comput. Sci. 3, 671012 (2021). https:\/\/doi.org\/10.3389\/fcomp.2021.671012","journal-title":"Front. Comput. Sci."},{"key":"25_CR9","doi-asserted-by":"publisher","unstructured":"Jang, M., Yoon, Y., Choi, J., Ong, H., Kim, J.: A structured prompting based on belief-desire-intention model for proactive and explainable task planning. In: Proceedings of the 11th International Conference on Human-Agent Interaction, pp. 375\u2013377 (2023). https:\/\/doi.org\/10.1145\/3623809.3623930","DOI":"10.1145\/3623809.3623930"},{"key":"25_CR10","doi-asserted-by":"publisher","first-page":"123531","DOI":"10.1109\/ACCESS.2023.3329370","volume":"11","author":"M Kerzel","year":"2023","unstructured":"Kerzel, M., Allgeuer, P., Strahl, E., Frick, N., Habekost, J.G., Eppe, M., Wermter, S.: NICOL: a neuro-inspired collaborative semi-humanoid robot that bridges social interaction and reliable manipulation. IEEE Access 11, 123531\u2013123542 (2023). https:\/\/doi.org\/10.1109\/ACCESS.2023.3329370","journal-title":"IEEE Access"},{"issue":"2","key":"25_CR11","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/MRA.2021.3066040","volume":"28","author":"M Koskinopoulou","year":"2021","unstructured":"Koskinopoulou, M., Raptopoulos, F., et al.: Robotic waste sorting technology: toward a vision-based categorization system for the industrial robotic separation of recyclable waste. IEEE Robot. Autom. Mag. 28(2), 50\u201360 (2021). https:\/\/doi.org\/10.1109\/MRA.2021.3066040","journal-title":"IEEE Robot. Autom. Mag."},{"key":"25_CR12","unstructured":"Lee, Y.K., Jung, Y., Kang, G., Hahn, S.: Developing social robots with empathetic non-verbal cues using large language models. 
arXiv:2308.16529 (2023)"},{"key":"25_CR13","unstructured":"Liu, J., et al.: LLM-powered hierarchical language agent for real-time Human-AI coordination. In: 23rd International Conference on Autonomous Agents and Multiagent Systems, pp. 1219\u20131228 (2024)"},{"key":"25_CR14","doi-asserted-by":"publisher","unstructured":"Lubitz, A., Gutzeit, L., Kirchner, F.: CoBaIR: a python library for context-based intention recognition in human-robot-interaction. In: 2023 32nd IEEE RO-MAN, pp. 2003\u20132009 (2023). https:\/\/doi.org\/10.1109\/RO-MAN57019.2023.10309581","DOI":"10.1109\/RO-MAN57019.2023.10309581"},{"key":"25_CR15","unstructured":"Lugaresi, C., et\u00a0al.: MediaPipe: a framework for building perception pipelines. arXiv:1906.08172 (2019)"},{"key":"25_CR16","doi-asserted-by":"publisher","unstructured":"Pan, Z., Hauser, K.: Decision making in joint push-grasp action space for large-scale object sorting. In: IEEE ICRA, pp. 6199\u20136205 (2021). https:\/\/doi.org\/10.1109\/ICRA48506.2021.9560782","DOI":"10.1109\/ICRA48506.2021.9560782"},{"key":"25_CR17","unstructured":"Radford, A., Kim, J.W., Xu, T., et\u00a0al.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518 (2023)"},{"key":"25_CR18","doi-asserted-by":"publisher","unstructured":"Ramadurai, S., Jeong, H.: Effect of human involvement on work performance and fluency in human-robot collaboration for recycling. In: ACM\/IEEE International Conference on HRI, pp. 1007\u20131011 (2022). https:\/\/doi.org\/10.1109\/HRI53351.2022.9889606","DOI":"10.1109\/HRI53351.2022.9889606"},{"key":"25_CR19","doi-asserted-by":"publisher","unstructured":"Serfaty, G.J., Barnard, V.O., Salisbury, J.P.: Generative facial expressions and eye gaze behavior from prompts for multi-human-robot interaction. In: Adjunct Proceedings of the 36th Annual ACM Symposium on User Interface Software and Technology, NY, USA (2023). https:\/\/doi.org\/10.1145\/3586182.3616623","DOI":"10.1145\/3586182.3616623"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. IEEE\/CVF CVPR, pp. 9568\u20139578 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"25_CR21","doi-asserted-by":"publisher","unstructured":"Veselic, S., Zito, C., Farina, D.: Human-robot interaction with robust prediction of movement intention surpasses manual control. Front. Neurorobotics 15, 695022 (2021). https:\/\/doi.org\/10.3389\/fnbot.2021.695022","DOI":"10.3389\/fnbot.2021.695022"},{"key":"25_CR22","doi-asserted-by":"publisher","unstructured":"Wang, C., et al.: Lami: large language models for multi-modal human-robot interaction. In: Extended Abstracts of the CHI Conference on Human Factors in Computing Systems. CHI 2024, vol.\u00a02, pp. 1\u201310. ACM, May 2024. https:\/\/doi.org\/10.1145\/3613905.3651029","DOI":"10.1145\/3613905.3651029"},{"key":"25_CR23","unstructured":"Yao, S., et al.: ReAct: synergizing reasoning and acting in language models. In: ICLR (2023)"},{"issue":"2","key":"25_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.hcc.2024.100211","volume":"4","author":"Y Yao","year":"2024","unstructured":"Yao, Y., Duan, J., Xu, K., Cai, Y., Sun, Z., Zhang, Y.: A survey on large language model (LLM) security and privacy: the good, the bad, and the ugly. High-Confid. Comput. 4(2), 100211 (2024)","journal-title":"High-Confid. 
Comput."},{"key":"25_CR25","unstructured":"Yoshida, T., Masumori, A., Ikegami, T.: From text to motion: Grounding GPT-4 in a humanoid robot \u201cAlter3\u201d. arXiv:abs\/2312.06571 (2023)"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Zhang, B., Soh, H.: Large language models as zero-shot human models for human-robot interaction. In: IEEE\/RSJ IROS, pp. 7961\u20137968 (2023)","DOI":"10.1109\/IROS55552.2023.10341488"},{"key":"25_CR27","unstructured":"Zhang, M.J.Q., Choi, E.: Clarify when necessary: Resolving ambiguity through interaction with LMs. arXiv:abs\/2311.09469 (2023)"},{"key":"25_CR28","doi-asserted-by":"publisher","unstructured":"Zhang, Y., Doyle, T.: Integrating intention-based systems in human-robot interaction: a scoping review of sensors, algorithms, and trust. Front. Robot. AI 10, 1233328 (2023). https:\/\/doi.org\/10.3389\/frobt.2023.1233328","DOI":"10.3389\/frobt.2023.1233328"},{"key":"25_CR29","unstructured":"Zhao, Q., Wang, S., Zhang, C., et\u00a0al.: AntGPT: can large language models help long-term action anticipation from videos? In: ICLR (2024)"}],"container-title":["Lecture Notes in Computer Science","Social Robotics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-3525-2_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T08:13:06Z","timestamp":1757146386000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-3525-2_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819635245","9789819635252"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-3525-2_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"25 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICSR + AI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Social Robotics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Odense","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Denmark","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"socrob2024a","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"https:\/\/icsr2024.dk","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}