{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T03:43:15Z","timestamp":1742960595955,"version":"3.40.3"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031431470"},{"type":"electronic","value":"9783031431487"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-43148-7_15","type":"book-chapter","created":{"date-parts":[[2023,9,4]],"date-time":"2023-09-04T20:48:35Z","timestamp":1693860515000},"page":"171-183","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Explainable Navigation and\u00a0Recounting"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8428-501X","authenticated-orcid":false,"given":"Samuele","family":"Poppi","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6457-1860","authenticated-orcid":false,"given":"Roberto","family":"Bigazzi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4142-0488","authenticated-orcid":false,"given":"Niyati","family":"Rawal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7885-6050","authenticated-orcid":false,"given":"Silvia","family":"Cascianelli","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5125-4957","authenticated-orcid":false,"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2239-283X","authenticated-orcid":false,"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,9,5]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"15_CR2","unstructured":"Anjomshoae, S., Najjar, A., Calvaresi, D., Fr\u00e4mling, K.: Explainable agents and robots: results from a systematic literature review. In: AAMAS (2019)"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Bigazzi, R., Cornia, M., Cascianelli, S., Baraldi, L., Cucchiara, R.: Embodied agents for efficient exploration and smart scene description. In: ICRA (2023)","DOI":"10.1109\/ICRA48891.2023.10160668"},{"issue":"2","key":"15_CR4","first-page":"2985","volume":"7","author":"R Bigazzi","year":"2022","unstructured":"Bigazzi, R., Landi, F., Cascianelli, S., Baraldi, L., Cornia, M., Cucchiara, R.: Focus on impact: indoor exploration with intrinsic motivation. RA-L 7(2), 2985\u20132992 (2022)","journal-title":"RA-L"},{"key":"15_CR5","unstructured":"Bigazzi, R., Landi, F., Cornia, M., Cascianelli, S., Baraldi, L., Cucchiara, R.: Explore and explain: self-supervised navigation and recounting. In: ICPR (2020)"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Bigazzi, R., Landi, F., Cornia, M., Cascianelli, S., Baraldi, L., Cucchiara, R.: Out of the box: embodied navigation in the real world. In: CAIP (2021)","DOI":"10.1007\/978-3-030-89128-2_5"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Bolelli, F., Baraldi, L., Pollastri, F., Grana, C.: A hierarchical quasi-recurrent approach to video captioning. In: IPAS (2018)","DOI":"10.1109\/IPAS.2018.8708893"},{"key":"15_CR8","unstructured":"Burda, Y., Edwards, H., Pathak, D., Storkey, A., Darrell, T., Efros, A.A.: Large-scale study of curiosity-driven learning. arXiv preprint arXiv:1808.04355 (2018)"},{"key":"15_CR9","doi-asserted-by":"crossref","unstructured":"Chang, A., et al.: Matterport3D: learning from RGB-D data in indoor environments. In: 3DV (2017)","DOI":"10.1109\/3DV.2017.00081"},{"key":"15_CR10","unstructured":"Chaplot, D.S., Gandhi, D., Gupta, S., Gupta, A., Salakhutdinov, R.: Learning to explore using active neural SLAM. In: ICLR (2019)"},{"key":"15_CR11","unstructured":"Chaplot, D.S., Gandhi, D.P., Gupta, A., Salakhutdinov, R.R.: Object goal navigation using goal-oriented semantic exploration. In: NeurIPS (2020)"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Cornia, M., Baraldi, L., Cucchiara, R.: SMArT: training shallow memory-aware transformers for robotic explainability. In: ICRA (2020)","DOI":"10.1109\/ICRA40945.2020.9196653"},{"issue":"2","key":"15_CR13","doi-asserted-by":"publisher","first-page":"111","DOI":"10.3233\/AIC-210172","volume":"35","author":"M Cornia","year":"2022","unstructured":"Cornia, M., Baraldi, L., Cucchiara, R.: Explaining transformer-based image captioning models: an empirical analysis. AI Commun. 35(2), 111\u2013129 (2022)","journal-title":"AI Commun."},{"key":"15_CR14","unstructured":"Cornia, M., Baraldi, L., Fiameni, G., Cucchiara, R.: Universal captioner: inducing content-style separation in vision-and-language model training. arXiv preprint arXiv:2111.12727 (2022)"},{"key":"15_CR15","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"15_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-319-46493-0_1","volume-title":"Computer Vision \u2013 ECCV 2016","author":"LA Hendricks","year":"2016","unstructured":"Hendricks, L.A., Akata, Z., Rohrbach, M., Donahue, J., Schiele, B., Darrell, T.: Generating visual explanations. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 3\u201319. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_1"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. In: EMNLP (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.Y.: Attention on attention for image captioning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"15_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"280","DOI":"10.1007\/978-3-031-19775-8_17","volume-title":"Computer Vision \u2013 ECCV 2016","author":"SS Kim","year":"2022","unstructured":"Kim, S.S., Meister, N., Ramaswamy, V.V., Fong, R., Russakovsky, O.: HIVE: evaluating the human interpretability of visual explanations. In: Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2016. LNCS, vol. 13672, pp. 280\u2013298. Springer, Heidelberg (2022). https:\/\/doi.org\/10.1007\/978-3-031-19775-8_17"},{"key":"15_CR21","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"15_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Lovino, M., Bontempo, G., Cirrincione, G., Ficarra, E.: Multi-omics classification on kidney samples exploiting uncertainty-aware models. In: ICIC (2020)","DOI":"10.1007\/978-3-030-60802-6_4"},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Pathak, D., Agrawal, P., Efros, A.A., Darrell, T.: Curiosity-driven exploration by self-supervised prediction. In: ICML (2017)","DOI":"10.1109\/CVPRW.2017.70"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: GloVe: global vectors for word representation. In: EMNLP (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"15_CR26","doi-asserted-by":"crossref","unstructured":"Poppi, S., Cornia, M., Baraldi, L., Cucchiara, R.: Revisiting the evaluation of class activation mapping for explainability: a novel metric and experimental analysis. In: CVPR Workshops (2021)","DOI":"10.1109\/CVPRW53098.2021.00260"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Poppi, S., Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R.: Multi-class explainable unlearning for image classification via weight filtering. arXiv preprint arXiv:2304.02049 (2023)","DOI":"10.1109\/MIS.2024.3412742"},{"key":"15_CR28","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"15_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"400","DOI":"10.1007\/978-3-030-58558-7_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"SK Ramakrishnan","year":"2020","unstructured":"Ramakrishnan, S.K., Al-Halah, Z., Grauman, K.: Occupancy anticipation for efficient exploration and navigation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 400\u2013418. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_24"},{"key":"15_CR30","doi-asserted-by":"publisher","first-page":"1616","DOI":"10.1007\/s11263-021-01437-z","volume":"129","author":"SK Ramakrishnan","year":"2021","unstructured":"Ramakrishnan, S.K., Jayaraman, D., Grauman, K.: An exploration of embodied visual exploration. IJCV 129, 1616\u20131649 (2021)","journal-title":"IJCV"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Sarto, S., Barraco, M., Cornia, M., Baraldi, L., Cucchiara, R.: Positive-augmented contrastive learning for image and video captioning evaluation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00668"},{"key":"15_CR32","doi-asserted-by":"crossref","unstructured":"Sarto, S., Cornia, M., Baraldi, L., Cucchiara, R.: Retrieval-augmented transformer for image captioning. In: CBMI (2022)","DOI":"10.1145\/3549555.3549585"},{"key":"15_CR33","doi-asserted-by":"crossref","unstructured":"Savva, M., et al.: Habitat: a platform for embodied AI research. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00943"},{"key":"15_CR34","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)"},{"key":"15_CR35","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"15_CR36","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: neural machine translation of rare words with subword units. In: ACL (2016)","DOI":"10.18653\/v1\/P16-1162"},{"key":"15_CR37","unstructured":"Simonyan, K., Vedaldi, A., Zisserman, A.: Deep inside convolutional networks: visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 (2013)"},{"key":"15_CR38","unstructured":"Springenberg, J.T., Dosovitskiy, A., Brox, T., Riedmiller, M.: Striving for simplicity: the all convolutional net. arXiv preprint arXiv:1412.6806 (2014)"},{"issue":"1","key":"15_CR39","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1109\/TPAMI.2022.3148210","volume":"45","author":"M Stefanini","year":"2022","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Cascianelli, S., Fiameni, G., Cucchiara, R.: From show to tell: a survey on deep learning-based image captioning. IEEE Trans. PAMI 45(1), 539\u2013559 (2022)","journal-title":"IEEE Trans. PAMI"},{"key":"15_CR40","unstructured":"Sundararajan, M., Taly, A., Yan, Q.: Axiomatic attribution for deep networks. In: ICML (2017)"},{"key":"15_CR41","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"15_CR42","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Score-CAM: score-weighted visual explanations for convolutional neural networks. In: CVPR Workshops (2020)","DOI":"10.1109\/CVPRW50498.2020.00020"},{"key":"15_CR43","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"15_CR44","unstructured":"You, Y., et al.: Large batch optimization for deep learning: training bert in 76 minutes. In: ICLR (2019)"},{"key":"15_CR45","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"818","DOI":"10.1007\/978-3-319-10590-1_53","volume-title":"Computer Vision \u2013 ECCV 2014","author":"MD Zeiler","year":"2014","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8689, pp. 818\u2013833. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_53"},{"key":"15_CR46","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.319"}],"container-title":["Lecture Notes in Computer Science","Image Analysis and Processing \u2013 ICIAP 2023"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-43148-7_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T13:24:18Z","timestamp":1730035458000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-43148-7_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031431470","9783031431487"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-43148-7_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"5 September 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIAP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Image Analysis and Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Udine","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 September 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iciap2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.iciap2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"144","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"85","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"59% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"https:\/\/iciap2023.org\/satellite-event\/workshops\/","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}