{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T05:14:48Z","timestamp":1773033288709,"version":"3.50.1"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783031084201","type":"print"},{"value":"9783031084218","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-08421-8_21","type":"book-chapter","created":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T21:02:38Z","timestamp":1658178158000},"page":"310-325","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["Vision-Based Holistic Scene Understanding for\u00a0Context-Aware Human-Robot Interaction"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3076-4509","authenticated-orcid":false,"given":"Giorgio","family":"De Magistris","sequence":"first","affiliation":[]},{"given":"Riccardo","family":"Caprari","sequence":"additional","affiliation":[]},{"given":"Giulia","family":"Castro","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1846-9996","authenticated-orcid":false,"given":"Samuele","family":"Russo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9057-8946","authenticated-orcid":false,"given":"Luca","family":"Iocchi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6606-200X","authenticated-orcid":false,"given":"Daniele","family":"Nardi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3336-5853","authenticated-orcid":false,"given":"Christian","family":"Napoli","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,7,19]]},"reference":[{"key":"21_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"593","DOI":"10.1007\/978-3-030-58558-7_35","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Diba","year":"2020","unstructured":"Diba, A., et al.: Large scale holistic video understanding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 593\u2013610. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_35"},{"issue":"1\u20132","key":"21_CR2","doi-asserted-by":"publisher","first-page":"116","DOI":"10.1016\/j.cviu.2006.10.019","volume":"108","author":"A Jaimes","year":"2007","unstructured":"Jaimes, A., Sebe, N.: Multimodal human-computer interaction: a survey. Comput. Vis. Image Underst. 108(1\u20132), 116\u2013134 (2007)","journal-title":"Comput. Vis. Image Underst."},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Bonanno, F., Capizzi, G., Coco, S., Napoli, C., Laudani, A., Sciuto, G.L.: Optimal thicknesses determination in a multilayer structure to improve the SPP efficiency for photovoltaic devices by an hybrid FEM - Cascade Neural Network based approach. In: 2014 International Symposium on Power Electronics. Electrical Drives, Automation and Motion, SPEEDAM, vol. 2014, pp. 355\u2013362 (2014)","DOI":"10.1109\/SPEEDAM.2014.6872103"},{"key":"21_CR4","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1016\/j.patrec.2014.04.011","volume":"48","author":"JK Aggarwal","year":"2014","unstructured":"Aggarwal, J.K., Xia, L.: Human activity recognition from 3D data: a review. Pattern Recogn. Lett. 48, 70\u201380 (2014)","journal-title":"Pattern Recogn. Lett."},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Li, W., Zhang, Z., Liu, Z.: Action recognition based on a bag of 3D points, Computer Vision and Pattern Recognition (CVPR) (2010)","DOI":"10.1109\/CVPRW.2010.5543273"},{"key":"21_CR6","unstructured":"Chen, L., Wei, H., Ferryman, J.: Tracking-based 3D human skeleton extraction from stereo video camera toward an on-site safety and ergonomic analysis. Computational Vision Group. School of Systems Engineering, University of Reading, UK (2013)"},{"issue":"3","key":"21_CR7","doi-asserted-by":"publisher","first-page":"348","DOI":"10.1108\/CI-10-2015-0054","volume":"16","author":"M Liu","year":"2016","unstructured":"Liu, M., Han, S., Lee, S.: A survey of human motion analysis using depth imagery. Constr. Innov. 16(3), 348\u2013367 (2016)","journal-title":"Constr. Innov."},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Ray, J., et al.: Scenes-objects-actions: a multi-task, multi-label video dataset. In: European Conference on Computer Vision (ECCV) (2018)","DOI":"10.1007\/978-3-030-01264-9_39"},{"key":"21_CR9","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. Center for Research in Computer Vision (CRCV) (2012)"},{"key":"21_CR10","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"21_CR11","unstructured":"Kay, W., et al.: The kinetics human action video dataset. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)"},{"key":"21_CR12","series-title":"Advances in Computer Vision and Pattern Recognition","doi-asserted-by":"publisher","first-page":"675","DOI":"10.1007\/978-3-319-28971-7_25","volume-title":"Context-Enhanced Information Fusion","author":"DD Bloisi","year":"2016","unstructured":"Bloisi, D.D., Nardi, D., Riccio, F., Trapani, F.: Context in robotics and information fusion. In: Snidaro, L., Garc\u00eda, J., Llinas, J., Blasch, E. (eds.) Context-Enhanced Information Fusion. ACVPR, pp. 675\u2013699. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-28971-7_25"},{"key":"21_CR13","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1016\/j.inffus.2015.01.002","volume":"25","author":"L Snidaro","year":"2015","unstructured":"Snidaro, L., Garc\u00eda, J., Llinas, J.: Context-based information fusion: a survey and discussion. Inf. Fusion 25, 16\u201331 (2015)","journal-title":"Inf. Fusion"},{"key":"21_CR14","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1007\/978-3-319-19324-3_42","volume-title":"Artificial Intelligence and Soft Computing","author":"BA Nowak","year":"2015","unstructured":"Nowak, B.A., Nowicki, R.K., Wo\u017aniak, M., Napoli, C.: Multi-class nearest neighbour classifier for incomplete data handling. In: Rutkowski, L., Korytkowski, M., Scherer, R., Tadeusiewicz, R., Zadeh, L.A., Zurada, J.M. (eds.) ICAISC 2015. LNCS (LNAI), vol. 9119, pp. 469\u2013480. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-19324-3_42"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marsza\u0142ek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: 26th IEEE Conference Computer Vision Pattern Recognition (CVPR) , pp. 1\u20138 (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: IEEE International Conference of Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"21_CR17","unstructured":"Abu-El-Haija, S., et al.: Youtube-8m: a large-scale video classification benchmark. CoRR, abs\/1609.08675 (2016)"},{"key":"21_CR18","unstructured":"Sensifai video tagging API. www.sensifai.com"},{"key":"21_CR19","unstructured":"Google vision AI API. https:\/\/cloud.google.com\/vision"},{"key":"21_CR20","unstructured":"TFRecord TensorFlow Tutorial. www.tensorflow.org\/tutorials\/load_data\/tfrecord"},{"key":"21_CR21","unstructured":"tf.data TensorFlow API. www.tensorflow.org\/api_docs\/python\/tf\/data"},{"issue":"41","key":"21_CR22","doi-asserted-by":"publisher","first-page":"30509","DOI":"10.1007\/s11042-020-09004-3","volume":"79","author":"DR Beddiar","year":"2020","unstructured":"Beddiar, D.R., Nini, B., Sabokrou, M., Hadid, A.: Vision-based human activity recognition: a survey. Multimedia Tools Appl. 79(41), 30509\u201330555 (2020). https:\/\/doi.org\/10.1007\/s11042-020-09004-3","journal-title":"Multimedia Tools Appl."},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2556\u20132563 (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"21_CR24","doi-asserted-by":"crossref","unstructured":"Karpathy, A. et al.: Large-scale video classification with convolutional neural networks (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"21_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Cosgun, A., Christensen, H.I.: Context-aware robot navigation using interactively built semantic maps. Paladyn Journal of Behavioral Robotics (2018)","DOI":"10.1515\/pjbr-2018-0020"},{"key":"21_CR27","doi-asserted-by":"crossref","unstructured":"Zender, H., Jensfelt, P., Kruijff, G.: Human-and situation-aware people following. In: 16th IEEE International Symposium on Robot and Human interactive Communication (RO-MAN), pp. 1131\u20131136 (2007)","DOI":"10.1109\/ROMAN.2007.4415250"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Pacchierotti, E., Christensen, H.I., Jensfelt, P.: Human-robot embodied interaction in hallway settings: a pilot user study. In: IEEE International Workshop on Robot and Human Interactive Communication (ROMAN), pp. 164\u2013171 (2005)","DOI":"10.1109\/ROMAN.2005.1513774"},{"issue":"1","key":"21_CR29","doi-asserted-by":"publisher","first-page":"227","DOI":"10.1109\/TSMC.2018.2833384","volume":"49","author":"J Quintas","year":"2018","unstructured":"Quintas, J., Martins, G.S., Santos, L., Menezes, P., Dias, J.: Toward a context-aware human-robot interaction framework based on cognitive development. IEEE Trans. Syst. Man Syst. Cybern. 49(1), 227\u2013237 (2018)","journal-title":"IEEE Trans. Syst. Man Syst. Cybern."},{"issue":"11","key":"21_CR30","doi-asserted-by":"publisher","first-page":"1615","DOI":"10.1109\/LSP.2018.2866926","volume":"25","author":"G Capizzi","year":"2018","unstructured":"Capizzi, G., Coco, S., Sciuto, G.L., Napoli, C.: A new iterative FIR filter design approach using a gaussian approximation. IEEE Sign. Process. Lett. 25(11), 1615\u20131619 (2018)","journal-title":"IEEE Sign. Process. Lett."},{"issue":"6088","key":"21_CR31","doi-asserted-by":"publisher","first-page":"533","DOI":"10.1038\/323533a0","volume":"323","author":"D Rumelhart","year":"1986","unstructured":"Rumelhart, D., Hinton, G., Williams, R.: Learning representations by back-propagating errors. Nature 323(6088), 533\u2013536 (1986)","journal-title":"Nature"},{"issue":"8","key":"21_CR32","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u201380 (1997)","journal-title":"Neural Comput."},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.:Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"21_CR34","unstructured":"Chung, J., Gulcehre, C., Cho, K., and Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)"},{"key":"21_CR35","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., and Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: IEEE International Conference on Computer Vision (ICCV), pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"issue":"1","key":"21_CR36","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3D convolutional neural networks for human action recognition. IEEE TPAMI 35(1), 221\u2013231 (2013)","journal-title":"IEEE TPAMI"},{"key":"21_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"622","DOI":"10.1007\/978-3-030-00776-8_57","volume-title":"Advances in Multimedia Information Processing \u2013 PCM 2018","author":"L Yao","year":"2018","unstructured":"Yao, L., Qian, Y.: DT-3DResNet-LSTM: an architecture for temporal activity recognition in videos. In: Hong, R., Cheng, W.-H., Yamasaki, T., Wang, M., Ngo, C.-W. (eds.) PCM 2018. LNCS, vol. 11164, pp. 622\u2013632. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-00776-8_57"},{"key":"21_CR38","doi-asserted-by":"publisher","unstructured":"Umamakeswari, A., Angelus, J., Kannan, M., Bragadeesh, S. A.: Action recognition using 3D CNN and LSTM for video analytics. In: International Conference on Intelligent Computing and Communication. Springer, Singapore (2020). https:\/\/doi.org\/10.1007\/978-981-15-1084-7_51","DOI":"10.1007\/978-981-15-1084-7_51"},{"issue":"5","key":"21_CR39","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s42979-020-00293-x","volume":"1","author":"R Alfaifi","year":"2020","unstructured":"Alfaifi, R., Artoli, A.M.: Human action prediction with 3D-CNN. SN Comput. Sci. 1(5), 1\u201315 (2020). https:\/\/doi.org\/10.1007\/s42979-020-00293-x","journal-title":"SN Comput. Sci."},{"issue":"6","key":"21_CR40","first-page":"599","volume":"20","author":"J Kim","year":"2014","unstructured":"Kim, J.: POMDP-based human-robot interaction behavior model. J. Inst. Control 20(6), 599\u2013605 (2014)","journal-title":"J. Inst. Control"},{"issue":"1","key":"21_CR41","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1007\/s12369-008-0001-3","volume":"1","author":"C Bartneck","year":"2009","unstructured":"Bartneck, C., Croft, E., Kulic, D., Zoghbi, S.: Measurement instruments for the anthropomorphism, animacy, likeability, perceived intelligence, and perceived safety of robots. Int. J. Soc. Robot. 1(1), 71\u201381 (2009). https:\/\/doi.org\/10.1007\/s12369-008-0001-3","journal-title":"Int. J. Soc. Robot."},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Pandey, A. K., and Gelin, R.: A mass-produced sociable humanoid robot: pepper: the first machine of its kind. In: IEEE Robotics & Automation Magazine (2018)","DOI":"10.1109\/MRA.2018.2833157"}],"container-title":["Lecture Notes in Computer Science","AIxIA 2021 \u2013 Advances in Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-08421-8_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T10:14:58Z","timestamp":1727604898000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-08421-8_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031084201","9783031084218"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-08421-8_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"19 July 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"AIxIA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference of the Italian Association for Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 December 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"aiia2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/aixia2021.disco.unimib.it\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"58","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"36","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"62% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}