{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,26]],"date-time":"2025-09-26T17:07:22Z","timestamp":1758906442921,"version":"3.40.3"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031533013"},{"type":"electronic","value":"9783031533020"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-53302-0_9","type":"book-chapter","created":{"date-parts":[[2024,1,28]],"date-time":"2024-01-28T09:02:09Z","timestamp":1706432529000},"page":"117-131","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["IFI: Interpreting for\u00a0Improving: A Multimodal Transformer with\u00a0an\u00a0Interpretability Technique for\u00a0Recognition of\u00a0Risk Events"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3335-2753","authenticated-orcid":false,"given":"Rupayan","family":"Mallick","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0659-8894","authenticated-orcid":false,"given":"Jenny","family":"Benois-Pineau","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9776-0449","authenticated-orcid":false,"given":"Akka","family":"Zemmari","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,29]]},"reference":[{"issue":"5","key":"9_CR1","doi-asserted-by":"publisher","first-page":"050901","DOI":"10.1117\/1.JEI.30.5.050901","volume":"30","author":"MP Ayyar","year":"2021","unstructured":"Ayyar, M.P., Benois-Pineau, J., Zemmari, A.: Review of white box methods for explanations of convolutional neural networks in image classification tasks. J. Electron. Imaging 30(5), 050901 (2021)","journal-title":"J. Electron. Imaging"},{"issue":"7","key":"9_CR2","doi-asserted-by":"publisher","first-page":"e0130140","DOI":"10.1371\/journal.pone.0130140","volume":"10","author":"S Bach","year":"2015","unstructured":"Bach, S., Binder, A., Montavon, G., Klauschen, F., M\u00fcller, K.R., Samek, W.: On pixel-wise explanations for non-linear classifier decisions by layer-wise relevance propagation. PLoS ONE 10(7), e0130140 (2015)","journal-title":"PLoS ONE"},{"key":"9_CR3","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML (2021)"},{"key":"9_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on CVPR 2017","DOI":"10.1109\/CVPR.2017.502"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Chefer, H., Gur, S., Wolf, L.: Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 782\u2013791, June 2021","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"9_CR7","unstructured":"Chefer, H., Schwartz, I., Wolf, L.: Optimizing relevance maps of vision transformers improves robustness. In: NeuRIPS 2022"},{"key":"9_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL (2019)"},{"key":"9_CR9","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR 2021"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3d: expanding architectures for efficient video recognition. In: 2020 IEEE\/CVF Conference on CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Guo, X., Guo, X., Lu, Y.: SSAN: separable self-attention network for video representation learning. In: Proceedings of the IEEE\/CVF CVPR 2021","DOI":"10.1109\/CVPR46437.2021.01243"},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Li, Q., Qiu, Z., Yao, T., Mei, T., Rui, Y., Luo, J.: Action recognition by learning deep multi-granular spatio-temporal video representation. In: Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval. ACM (2016)","DOI":"10.1145\/2911996.2912001"},{"key":"9_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"622","DOI":"10.1007\/978-3-030-58536-5_37","volume-title":"Computer Vision \u2013 ECCV 2020","author":"H Liang","year":"2020","unstructured":"Liang, H., et al.: Training interpretable convolutional neural networks by differentiating class-specific filters. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 622\u2013638. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_37"},{"key":"9_CR14","unstructured":"Liu, K., Li, Y., Xu, N., Natarajan, P.: Learn to combine modalities in multimodal deep learning. ArXiv abs\/1805.11730 (2018)"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10012\u201310022, October 2021","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"9_CR16","unstructured":"Liu, Z., et al.: Video swin transformer. arXiv preprint arXiv:2106.13230 (2021)"},{"key":"9_CR17","unstructured":"Lundberg, S.M., Lee, S.I.: A unified approach to interpreting model predictions. In: Guyon, I., et al. (eds.) Advances in NeuRIPS 2017"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Mallick, R., Benois-Pineau, J., Zemmari, A.: I saw: a self-attention weighted method for explanation of visual transformers. In: IEEE ICIP 2022","DOI":"10.1109\/ICIP46576.2022.9897347"},{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Mallick, R., et al.: Pooling transformer for detection of risk events in in-the-wild video ego data. In: 26th ICPR 2022","DOI":"10.1109\/ICPR56361.2022.9956675"},{"issue":"1","key":"9_CR20","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1109\/MMUL.2022.3147381","volume":"29","author":"R Mallick","year":"2022","unstructured":"Mallick, R., Yebda, T., Benois-Pineau, J., Zemmari, A., Pech, M., Amieva, H.: Detection of risky situations for frail adults with hybrid neural networks on multimodal health data. IEEE Multim. 29(1), 7\u201317 (2022)","journal-title":"IEEE Multim."},{"key":"9_CR21","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1016\/j.jvcir.2018.01.009","volume":"51","author":"G Meditskos","year":"2018","unstructured":"Meditskos, G., Plans, P., Stavropoulos, T.G., Benois-Pineau, J., Buso, V., Kompatsiaris, I.: Multi-modal activity recognition from egocentric vision, semantic enrichment and lifelogging applications for the care of dementia. J. Vis. Commun. Image Represent. 51, 169\u2013190 (2018)","journal-title":"J. Vis. Commun. Image Represent."},{"key":"9_CR22","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1016\/j.patcog.2016.11.008","volume":"65","author":"G Montavon","year":"2017","unstructured":"Montavon, G., Lapuschkin, S., Binder, A., Samek, W., M\u00fcller, K.R.: Explaining nonlinear classification decisions with deep Taylor decomposition. Pattern Recogn. 65, 211\u2013222 (2017)","journal-title":"Pattern Recogn."},{"key":"9_CR23","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., Ng, A.Y.: Multimodal deep learning. In: Proceedings of the 28th ICML 2011"},{"key":"9_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"639","DOI":"10.1007\/978-3-030-01231-1_39","volume-title":"Computer Vision \u2013 ECCV 2018","author":"A Owens","year":"2018","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 639\u2013658. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_39"},{"key":"9_CR25","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Ribeiro, M.T., Singh, S., Guestrin, C.: \u201cWhy should I trust you?: Explaining the predictions of any classifier. In: KDD, pp. 1135\u20131144. ACM (2016)","DOI":"10.1145\/2939672.2939778"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE\/CVF ICCV (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"9_CR28","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS (2014)"},{"key":"9_CR29","unstructured":"Springenberg, J., Dosovitskiy, A., Brox, T., Riedmiller, M.: Striving for simplicity: the all convolutional net, December 2014"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Srinivas, A., Lin, T.Y., Parmar, N., Shlens, J., Abbeel, P., Vaswani, A.: Bottleneck transformers for visual recognition. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"9_CR31","unstructured":"Srinivas, S., Fleuret, F.: Full-gradient representation for neural network visualization. In: Advances in Neural Information Processing Systems (2019)"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4489\u20134497, December 2015","DOI":"10.1109\/ICCV.2015.510"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Feiszli, M.: Video classification with channel-separated convolutional networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), October 2019","DOI":"10.1109\/ICCV.2019.00565"},{"key":"9_CR34","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"9_CR35","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity (2020). https:\/\/arxiv.org\/abs\/2006.04768"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2018","DOI":"10.1109\/CVPR.2018.00813"},{"key":"9_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"342","DOI":"10.1007\/978-3-030-67835-7_29","volume-title":"MultiMedia Modeling","author":"T Yebda","year":"2021","unstructured":"Yebda, T., Benois-Pineau, J., Pech, M., Amieva, H., Middleton, L., Bergelt, M.: Multimodal sensor data analysis for\u00a0detection of risk situations of fragile people in @home environments. In: Loko\u010d, J., et al. (eds.) MMM 2021. LNCS, vol. 12573, pp. 342\u2013353. Springer, Cham (2021). https:\/\/doi.org\/10.1007\/978-3-030-67835-7_29"},{"key":"9_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"818","DOI":"10.1007\/978-3-319-10590-1_53","volume-title":"Computer Vision \u2013 ECCV 2014","author":"MD Zeiler","year":"2014","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8689, pp. 818\u2013833. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_53"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Zhang, H., Torres, F., Sicre, R., Avrithis, Y., Ayache, S.: Opti-CAM: optimizing saliency maps for interpretability. CoRR abs\/2301.07002 (2023)","DOI":"10.2139\/ssrn.4476687"},{"key":"9_CR40","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-53302-0_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T11:57:05Z","timestamp":1709812625000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-53302-0_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031533013","9783031533020"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-53302-0_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"29 January 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 February 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ConfTool Pro","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"297","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"112","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}