{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T10:20:00Z","timestamp":1772187600836,"version":"3.50.1"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032045515","type":"print"},{"value":"9783032045522","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T00:00:00Z","timestamp":1758585600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,23]],"date-time":"2025-09-23T00:00:00Z","timestamp":1758585600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04552-2_10","type":"book-chapter","created":{"date-parts":[[2025,9,22]],"date-time":"2025-09-22T14:15:04Z","timestamp":1758550504000},"page":"85-97","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Pointing-Guided Target Estimation via\u00a0Transformer-Based Attention"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-7682-1786","authenticated-orcid":false,"given":"Luca","family":"M\u00fcller","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9907-1834","authenticated-orcid":false,"given":"Hassan","family":"Ali","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2355-0764","authenticated-orcid":false,"given":"Philipp","family":"Allgeuer","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8646-2147","authenticated-orcid":false,"given":"Luk\u00e1\u0161","family":"Gajdo\u0161ech","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1343-4775","authenticated-orcid":false,"given":"Stefan","family":"Wermter","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,23]]},"reference":[{"key":"10_CR1","doi-asserted-by":"crossref","unstructured":"Ali, H., Allgeuer, P., Wermter, S.: Comparing apples to oranges: LLM-powered multimodal intention prediction in an object categorization task. In: ICSR (2024)","DOI":"10.1007\/978-981-96-3525-2_25"},{"key":"10_CR2","doi-asserted-by":"crossref","unstructured":"Allgeuer, P., Ali, H., Wermter, S.: When robots get chatty: grounding multimodal human-robot conversation and collaboration, pp. 306\u2013321. Springer (2024)","DOI":"10.1007\/978-3-031-72341-4_21"},{"key":"10_CR3","doi-asserted-by":"crossref","unstructured":"Antoun, M., Asmar, D.: Human object interaction detection: design and survey. Image Vis. Comput. 130(C) (2023)","DOI":"10.1016\/j.imavis.2022.104617"},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Azari, B., Lim, A., Vaughan, R.T.: Commodifying pointing in HRI: simple and fast pointing gesture detection from RGB-D images. In: Conference on Computer and Robot Vision (CRV), pp. 174\u2013180 (2019)","DOI":"10.1109\/CRV.2019.00031"},{"key":"10_CR5","unstructured":"Bamani, E., Nissinman, E., Koenigsberg, L., Meir, I., Matalon, Y., Sintov, A.: Recognition and estimation of human finger pointing with an RGB camera for robot directive. arXiv preprint arXiv:2307.02949 (2023)"},{"key":"10_CR6","doi-asserted-by":"crossref","unstructured":"Calli, B., Singh, A., Walsman, A., Srinivasa, S., Abbeel, P., Dollar, A.M.: The YCB object and model set: towards common benchmarks for manipulation research. In: International Conference on Advanced Robotics ICAR, pp. 510\u2013517 (2015)","DOI":"10.1109\/ICAR.2015.7251504"},{"issue":"01","key":"10_CR7","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/TPAMI.2019.2929257","volume":"43","author":"Z Cao","year":"2021","unstructured":"Cao, Z., Hidalgo, G., Simon, T., Wei, S.E., Sheikh, Y.: OpenPose: realtime multi-person 2D pose estimation using part affinity fields. IEEE Trans. Pattern Anal. Mach. Int. 43(01), 172\u2013186 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Int."},{"key":"10_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-End object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Chen, M., Liao, Y., Liu, S., Chen, Z., Wang, F., Qian, C.: Reformulating HOI detection as adaptive set prediction. In: CVPR, pp. 9004\u20139013 (2021)","DOI":"10.1109\/CVPR46437.2021.00889"},{"key":"10_CR10","doi-asserted-by":"crossref","unstructured":"Haque, F., Nancel, M., Vogel, D.: Myopoint: pointing and clicking using forearm mounted electromyography and inertial motion sensors. In: Proceedings of the 33rd Annual ACM Conference on Human Factors in Computing Systems, pp. 3653\u20133656. CHI \u201915, Association for Computing Machinery, NY, USA (2015)","DOI":"10.1145\/2702123.2702133"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Hewe, G.W.: Gesture language in culture contact. In: Sign Language Studies 3, vol.\u00a04, pp. 1\u201334. Gallaudet University Press (1974)","DOI":"10.1353\/sls.1974.0010"},{"key":"10_CR12","doi-asserted-by":"crossref","unstructured":"Hu, K., Canavan, S., Yin, L.: Hand pointing estimation for human computer interaction based on two orthogonal-views. In: Proceedings of International Conference on Pattern Recognition, pp. 3760\u20133763 (2010)","DOI":"10.1109\/ICPR.2010.916"},{"key":"10_CR13","doi-asserted-by":"crossref","unstructured":"Ji, J., Desai, R., Niebles, J.C.: Detecting human-object relationships in videos. In: ICCV, pp. 8106\u20138116 (2021)","DOI":"10.1109\/ICCV48922.2021.00800"},{"key":"10_CR14","doi-asserted-by":"crossref","unstructured":"Kerzel, M., et al.: NICOL: a neuro-inspired collaborative semi-humanoid robot that bridges social interaction and reliable manipulation. IEEE Access 11 (2023)","DOI":"10.1109\/ACCESS.2023.3329370"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Kim, B., Lee, J., Kang, J., Kim, E.S., Kim, H.J.: HOTR: end-to-end human-object interaction detection with transformers. In: CVPR, pp. 74\u201383 (2021)","DOI":"10.1109\/CVPR46437.2021.00014"},{"key":"10_CR16","volume-title":"Pointing: Where Language, Culture, and Cognition Meet","year":"2003","unstructured":"Kita, S. (ed.): Pointing: Where Language, Culture, and Cognition Meet. Lawrence Erlbaum Associates, Mahwah, NJ (2003)"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Kuramochi, A., Komuro, T.: 3D hand pointing recognition over a wide area using two fisheye cameras, pp. 58\u201367. Springer (2021)","DOI":"10.1007\/978-3-030-90963-5_5"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Lenz, F.: Deictic Conceptualisation of Space, Time and Person. John Benjamins (2003)","DOI":"10.1075\/pbns.112.02len"},{"key":"10_CR19","unstructured":"Lugaresi, C., et al.: MediaPipe: a framework for perceiving and processing reality. In: Third Workshop on Computer Vision for AR\/VR at IEEE CVPR 2019 (2019)"},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Maji, D., Nagori, S., Mathew, M., Poddar, D.: YOLO-pose: enhancing YOLO for multi person pose estimation using object keypoint similarity loss. In: CVPR Workshops, pp. 2637\u20132646 (2022)","DOI":"10.1109\/CVPRW56347.2022.00297"},{"issue":"1","key":"10_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1186\/s40648-021-00200-w","volume":"8","author":"ACS Medeiros","year":"2021","unstructured":"Medeiros, A.C.S., Ratsamee, P., Orlosky, J., Uranishi, Y., Higashida, M., Takemura, H.: 3D pointing gestures as target selection tools: guiding monocular UAVs during window selection in an outdoor environment. ROBOMECH J. 8(1), 1\u201319 (2021). https:\/\/doi.org\/10.1186\/s40648-021-00200-w","journal-title":"ROBOMECH J."},{"key":"10_CR22","unstructured":"Minderer, M., Gritsenko, A., Houlsby, N.: scaling open-vocabulary object detection. In: Proceedings of the 37th International Conference on Neural Information Processing Systems. NIPS \u201923, Curran Associates Inc., Red Hook, NY, USA (2023)"},{"issue":"5","key":"10_CR23","doi-asserted-by":"publisher","first-page":"1147","DOI":"10.1109\/TRO.2015.2463671","volume":"31","author":"R Mur-Artal","year":"2015","unstructured":"Mur-Artal, R., Montiel, J.M.M., Tardos, J.D.: ORB-SLAM: a versatile and accurate monocular SLAM system. IEEE Trans. Rob. 31(5), 1147\u20131163 (2015)","journal-title":"IEEE Trans. Rob."},{"issue":"03","key":"10_CR24","doi-asserted-by":"publisher","first-page":"1623","DOI":"10.1109\/TPAMI.2020.3019967","volume":"44","author":"R Ranftl","year":"2022","unstructured":"Ranftl, R., Lasinger, K., Hafner, D., Schindler, K., Koltun, V.: Towards robust monocular depth estimation: mixing datasets for zero-shot cross-dataset transfer. IEEE Trans. Pattern Anal. Mach. Int. 44(03), 1623\u20131637 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Int."},{"key":"10_CR25","doi-asserted-by":"crossref","unstructured":"Sikeridis, D., Antonakopoulos, T.: An IMU-based wearable system for automatic pointing during presentations. Image Process. Commun. 21 (2017)","DOI":"10.1515\/ipc-2016-0007"},{"key":"10_CR26","doi-asserted-by":"crossref","unstructured":"Tamura, M., Ohashi, H., Yoshinaga, T.: QPIC: query-based pairwise human-object interaction detection with image-wide contextual information. In: CVPR, pp. 10405\u201310414 (2021)","DOI":"10.1109\/CVPR46437.2021.01027"},{"key":"10_CR27","doi-asserted-by":"crossref","unstructured":"Tomasello, M.: Origins of Human Communication. MIT Press (2008)","DOI":"10.7551\/mitpress\/7551.001.0001"},{"key":"10_CR28","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s12369-017-0408-9","volume":"9","author":"M T\u00f6lgyessy","year":"2017","unstructured":"T\u00f6lgyessy, M., Dekan, M., Ducho\u0148, F., Rodina, J., Hubinsk\u00fd, P., Chovanec, L.: Foundations of visual linear human-robot interaction via pointing gesture navigation. Int. J. Soc. Robot. 9, 1\u201315 (2017)","journal-title":"Int. J. Soc. Robot."},{"key":"10_CR29","unstructured":"Zhang, A., et al.: Mining the benefits of two-stage and one-stage HOI detection. In: Ranzato, M., Beygelzimer, A., Dauphin, Y., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems, vol.\u00a034, pp. 17209\u201317220. Curran Associates, Inc. (2021)"},{"key":"10_CR30","unstructured":"Zhang, F., et al.: MediaPipe hands: on-device real-time hand tracking. arXiv preprint arXiv:2006.10214 (2020)"},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Zou, C., et al.: End-to-end human object interaction detection with HOI transformer. In: CVPR, pp. 11820\u201311829 (2021)","DOI":"10.1109\/CVPR46437.2021.01165"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning. ICANN 2025 International Workshops and Special Sessions"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04552-2_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T09:22:12Z","timestamp":1772184132000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04552-2_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,23]]},"ISBN":["9783032045515","9783032045522"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04552-2_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,23]]},"assertion":[{"value":"23 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kaunas","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lithuania","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"34","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/e-nns.org\/icann2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}