{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T19:47:06Z","timestamp":1770752826217,"version":"3.50.0"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"35","license":[{"start":{"date-parts":[[2024,3,11]],"date-time":"2024-03-11T00:00:00Z","timestamp":1710115200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,3,11]],"date-time":"2024-03-11T00:00:00Z","timestamp":1710115200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-024-18687-x","type":"journal-article","created":{"date-parts":[[2024,3,11]],"date-time":"2024-03-11T14:12:20Z","timestamp":1710166340000},"page":"83339-83356","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["A hybrid transformer with domain adaptation using interpretability techniques for the application to the detection of risk situations"],"prefix":"10.1007","volume":"83","author":[{"given":"Rupayan","family":"Mallick","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0659-8894","authenticated-orcid":false,"given":"Jenny","family":"Benois-Pineau","sequence":"additional","affiliation":[]},{"given":"Akka","family":"Zemmari","sequence":"additional","affiliation":[]},{"given":"Kamel","family":"Guerda","sequence":"additional","affiliation":[]},{"given":"Boris","family":"Mansencal","sequence":"additional","affiliation":[]},{"given":"Helene","family":"Amieva","sequence":"additional","affiliation":[]},{"given":"Laura","family":"Middleton","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,3,11]]},"reference":[{"key":"18687_CR1","doi-asserted-by":"crossref","unstructured":"Qiu X, Sun T, Xu Y, Shao Y, Dai N, Huang X (2020) Pre-trained models for natural language processing: a survey. arXiv:2003.08271","DOI":"10.1007\/s11431-020-1647-3"},{"issue":"01","key":"18687_CR2","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1109\/TPAMI.2022.3152247","volume":"45","author":"K Han","year":"2023","unstructured":"Han K, Wang Y, Chen H, Chen X, Guo J, Liu Z, Tang Y, Xiao A, Xu C, Xu Y, Yang Z, Zhang Y, Tao D (2023) A survey on vision transformer. IEEE Trans Pattern Anal Mach Intell 45(01):87\u2013110. https:\/\/doi.org\/10.1109\/TPAMI.2022.3152247","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"18687_CR3","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, Uszkoreit J, Houlsby N (2021) An image is worth 16x16 words: transformers for image recognition at scale. In: 9th International conference on learning representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"18687_CR4","doi-asserted-by":"publisher","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2019) BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics: human language technologies, vol 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota. https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423"},{"issue":"3","key":"18687_CR5","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky O, Deng J, Su H, Krause J, Satheesh S, Ma S, Huang Z, Karpathy A, Khosla A, Bernstein M, Berg AC, Fei-Fei L (2015) ImageNet large scale visual recognition challenge. International Journal of Computer Vision (IJCV) 115(3):211\u2013252. https:\/\/doi.org\/10.1007\/s11263-015-0816-y","journal-title":"International Journal of Computer Vision (IJCV)"},{"issue":"1","key":"18687_CR6","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1109\/MMUL.2022.3147381","volume":"29","author":"R Mallick","year":"2022","unstructured":"Mallick R, Yebda T, Benois-Pineau J, Zemmari A, Pech M, Amieva H (2022) Detection of risky situations for frail adults with hybrid neural networks on multimodal health data. IEEE Multim 29(1):7\u201317","journal-title":"IEEE Multim"},{"key":"18687_CR7","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. In: NIPS, pp 5998\u20136008"},{"key":"18687_CR8","unstructured":"Liu Y, Ott M, Goyal N, Du J, Joshi M, Chen D, Levy O, Lewis M, Zettlemoyer L, Stoyanov V (2020) RoBERTa: a robustly optimized BERT pretraining approach. https:\/\/openreview.net\/forum?id=SyxS0T4tvS"},{"key":"18687_CR9","doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Feiszli M (2019) Video classification with channel-separated convolutional networks. In: Proceedings of the IEEE\/CVF international conference on computer vision (ICCV)","DOI":"10.1109\/ICCV.2019.00565"},{"key":"18687_CR10","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer vision - ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. In: Vedaldi A, Bischof H, Brox T, Frahm J-M (eds) Computer vision - ECCV 2020. Springer, Cham, pp 213\u2013229"},{"key":"18687_CR11","unstructured":"Bertasius G, Wang H, Torresani L (2021) Is space-time attention all you need for video understanding? In: Proceedings of the International Conference on Machine Learning (ICML)"},{"key":"18687_CR12","doi-asserted-by":"crossref","unstructured":"Arnab A, Dehghani M, Heigold G, Sun C, Lu\u010di\u0107 M, Schmid C (2021) Vivit: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 6836\u20136846","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"18687_CR13","doi-asserted-by":"publisher","unstructured":"Li Q, Qiu Z, Yao T, Mei T, Rui Y, Luo J (2016) Action recognition by learning deep multi-granular spatio-temporal video representation. In: Proceedings of the 2016 ACM on international conference on multimedia retrieval. ICMR \u201916, pp 159\u2013166. Association for Computing Machinery, New York, USA. https:\/\/doi.org\/10.1145\/2911996.2912001","DOI":"10.1145\/2911996.2912001"},{"key":"18687_CR14","doi-asserted-by":"publisher","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks, pp 4489\u20134497. https:\/\/doi.org\/10.1109\/ICCV.2015.510","DOI":"10.1109\/ICCV.2015.510"},{"issue":"1","key":"18687_CR15","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji S, Xu W, Yang M, Yu K (2013) 3d convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231. https:\/\/doi.org\/10.1109\/TPAMI.2012.59","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"18687_CR16","doi-asserted-by":"crossref","unstructured":"Guo X, Guo X, Lu Y (2021) Ssan: separable self-attention network for video representation learning. In: Proceedings of the IEEE\/CVF conference on Computer Vision and Pattern Recognition (CVPR), pp 12618\u201312627","DOI":"10.1109\/CVPR46437.2021.01243"},{"key":"18687_CR17","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, Krueger G, Sutskever I (2021) Learning transferable visual models from natural language supervision. arXiv:2103.00020"},{"key":"18687_CR18","doi-asserted-by":"publisher","unstructured":"Owens A, Efros AA (2018) Audio-visual scene analysis with self-supervised multisensory features. In: Computer vision \u2013 ECCV 2018: 15th European conference, Munich, Germany, September 8\u201314, 2018, Proceedings, Part VI, pp 639\u2013658. Springer, Berlin, Heidelberg. https:\/\/doi.org\/10.1007\/978-3-030-01231-1_39","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"18687_CR19","doi-asserted-by":"publisher","first-page":"169","DOI":"10.1016\/j.jvcir.2018.01.009","volume":"51","author":"G Meditskos","year":"2018","unstructured":"Meditskos G, Plans P, Stavropoulos TG, Benois-Pineau J, Buso V, Kompatsiaris I (2018) Multi-modal activity recognition from egocentric vision, semantic enrichment and lifelogging applications for the care of dementia. J Vis Commun Image Represent 51:169\u2013190","journal-title":"J Vis Commun Image Represent"},{"key":"18687_CR20","unstructured":"Ngiam J, Khosla A, Kim M, Nam J, Lee H, Ng AY (2011) Multimodal deep learning. In: Proceedings of the 28th international conference on international conference on machine learning. ICML\u201911, pp 689\u2013696. Omnipress, Madison, WI, USA"},{"key":"18687_CR21","doi-asserted-by":"publisher","unstructured":"Tsai Y-HH, Bai S, Liang PP, Kolter JZ, Morency L-P, Salakhutdinov R (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the 57th annual meeting of the association for computational linguistics, pp 6558\u20136569. Association for Computational Linguistics, Florence, Italy. https:\/\/doi.org\/10.18653\/v1\/P19-1656","DOI":"10.18653\/v1\/P19-1656"},{"key":"18687_CR22","doi-asserted-by":"crossref","unstructured":"Zhou L, Gurrin C (2022) Multimodal embedding for lifelog retrieval. In: MMM (1). Lecture Notes in Computer Science, vol 13141, pp 416\u2013427. Springer","DOI":"10.1007\/978-3-030-98358-1_33"},{"key":"18687_CR23","unstructured":"Goodfellow IJ, Bengio Y, Courville AC (2016) Deep Learning. Adaptive computation and machine learning. MIT Press"},{"key":"18687_CR24","doi-asserted-by":"crossref","unstructured":"Oquab M, Bottou L, Laptev I, Sivic J (2014) Learning and transferring mid-level image representations using convolutional neural networks. In: CVPR, pp 1717\u20131724. IEEE Computer Society","DOI":"10.1109\/CVPR.2014.222"},{"key":"18687_CR25","doi-asserted-by":"crossref","unstructured":"Kolesnikov A, Beyer L, Zhai X, Puigcerver J, Yung J, Gelly S, Houlsby N (2020) Big transfer (bit): general visual representation learning. In: ECCV (5). Lecture Notes in Computer Science, vol 12350, pp 491\u2013507. Springer","DOI":"10.1007\/978-3-030-58558-7_29"},{"key":"18687_CR26","doi-asserted-by":"publisher","unstructured":"Improving alzheimer\u2019s stage categorization with convolutional neural network using transfer learning and different magnetic resonance imaging modalities. Heliyon 6(12), 05652 (2020). https:\/\/doi.org\/10.1016\/j.heliyon.2020.e05652","DOI":"10.1016\/j.heliyon.2020.e05652"},{"key":"18687_CR27","unstructured":"Bao H, Dong L, Piao S, Wei F (2022) BEit: BERT pre-training of image transformers. In: International conference on learning representations. https:\/\/openreview.net\/forum?id=p-BhZSz59o4"},{"key":"18687_CR28","doi-asserted-by":"crossref","unstructured":"Ayyar MP, Benois-Pineau J, Zemmari A (2021) Review of white box methods for explanations of convolutional neural networks in image classification tasks. J Electronic Imaging 30(5)","DOI":"10.1117\/1.JEI.30.5.050901"},{"key":"18687_CR29","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: NIPS"},{"key":"18687_CR30","unstructured":"Smilkov D, Thorat N, Kim B, Vi\u00e9gas FB, Wattenberg M (2017) Smoothgrad: removing noise by adding noise, 1\u201310. arXiv:1706.03825"},{"key":"18687_CR31","doi-asserted-by":"crossref","unstructured":"Selvaraju, RR, Cogswell M, Das A, Vedantam R, Parikh D, Batra D (2017) Grad-cam: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV)","DOI":"10.1109\/ICCV.2017.74"},{"issue":"7","key":"18687_CR32","doi-asserted-by":"publisher","first-page":"0130140","DOI":"10.1371\/journal.pone.0130140","volume":"10","author":"S Bach","year":"2015","unstructured":"Bach S, Binder A, Montavon G, Klauschen F, M\u00fcller K-R, Samek W (2015) On pixel-wise explanations for non-linear classifier decisions by layer-wise relevance propagation. PLoS ONE 10(7):0130140. https:\/\/doi.org\/10.1371\/journal.pone.0130140","journal-title":"PLoS ONE"},{"key":"18687_CR33","doi-asserted-by":"crossref","unstructured":"Chefer H, Gur S, Wolf L (2021) Transformer interpretability beyond attention visualization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 782\u2013791","DOI":"10.1109\/CVPR46437.2021.00084"},{"key":"18687_CR34","unstructured":"Springenberg J, Dosovitskiy A, Brox T, Riedmiller M (2014) Striving for simplicity: the all convolutional net"},{"key":"18687_CR35","unstructured":"Srinivas S, Fleuret F (2019) Full-gradient representation for neural network visualization. In: Advances in neural information processing systems, pp 4126\u20134135. https:\/\/proceedings.neurips.cc\/paper\/2019\/file\/80537a945c7aaa788ccfcdf1b99b5d8f-Paper.pdf"},{"key":"18687_CR36","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1016\/j.patcog.2016.11.008","volume":"65","author":"G Montavon","year":"2017","unstructured":"Montavon G, Lapuschkin S, Binder A, Samek W, M\u00fcller K-R (2017) Explaining nonlinear classification decisions with deep taylor decomposition. Pattern Recogn 65:211\u2013222. https:\/\/doi.org\/10.1016\/j.patcog.2016.11.008","journal-title":"Pattern Recogn"},{"key":"18687_CR37","doi-asserted-by":"crossref","unstructured":"Ribeiro MT, Singh S, Guestrin C (2016) \u201cwhy should I trust you?\u201d: explaining the predictions of any classifier. In: KDD, pp 1135\u20131144. ACM","DOI":"10.1145\/2939672.2939778"},{"key":"18687_CR38","doi-asserted-by":"publisher","unstructured":"Mallick R, Benois-Pineau J, Zemmari A (2022) I saw: a self-attention weighted method for explanation of visual transformers. In: 2022 IEEE International Conference on Image Processing (ICIP), pp 3271\u20133275. https:\/\/doi.org\/10.1109\/ICIP46576.2022.9897347","DOI":"10.1109\/ICIP46576.2022.9897347"},{"key":"18687_CR39","doi-asserted-by":"publisher","unstructured":"Mallick R, Benois-Pineau J, Zemmari A, Yebda T, Pech M, Amieva H, Middleton L (2022) Pooling transformer for detection of risk events in in-the-wild video ego data. In: 2022 26th International Conference on Pattern Recognition (ICPR), pp 2778\u20132784. https:\/\/doi.org\/10.1109\/ICPR56361.2022.9956675","DOI":"10.1109\/ICPR56361.2022.9956675"},{"key":"18687_CR40","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser Lu, Polosukhin I (2017) Attention is all you need. In: Advances in Neural Information Processing Systems, pp 5998\u20136008. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"18687_CR41","unstructured":"Anguita D, Ghio A, Oneto L, Parra X, Reyes-Ortiz JL (2013) A public domain dataset for human activity recognition using smartphones. In: ESANN"},{"key":"18687_CR42","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2017.502"},{"key":"18687_CR43","doi-asserted-by":"crossref","unstructured":"Yebda T, Benois-Pineau J, Pech M, Amieva H, Middleton L, Bergelt M (2021) Multimodal sensor data analysis for detection of risk situations of fragile people in @home environments. In: MMM (2). Lecture Notes in Computer Science, vol 12573, pp 342\u2013353. Springer","DOI":"10.1007\/978-3-030-67835-7_29"},{"issue":"suppl-2","key":"18687_CR44","doi-asserted-by":"publisher","first-page":"55","DOI":"10.1093\/ageing\/afl088","volume":"35","author":"SR Lord","year":"2006","unstructured":"Lord SR, Menz HB, Sherrington C (2006) Home environment risk factors for falls in older people and the efficacy of home modifications. Age and ageing 35(suppl-2):55\u201359","journal-title":"Age and ageing"},{"key":"18687_CR45","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/JTEHM.2016.2620177","volume":"4","author":"T Pozaic","year":"2016","unstructured":"Pozaic T, Lindemann U, Grebe A-K, Stork W (2016) Sit-to-stand transition reveals acute fall risk in activities of daily living. IEEE J Trans Eng Health Med 4:1\u201311","journal-title":"IEEE J Trans Eng Health Med"},{"key":"18687_CR46","doi-asserted-by":"crossref","unstructured":"Wang X, Girshick R, Gupta A, He K (2018) Non-local neural networks. In: Proceedings of the IEEE conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"18687_CR47","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C (2020) X3d: expanding architectures for efficient video recognition. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 200\u2013210","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"18687_CR48","doi-asserted-by":"crossref","unstructured":"Liu Z, Ning J, Cao Y, Wei Y, Zhang Z, Lin S, Hu H (2021) Video swin transformer. arXiv:2106.13230","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"18687_CR49","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18687-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-024-18687-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-024-18687-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T07:08:16Z","timestamp":1728630496000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-024-18687-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,11]]},"references-count":49,"journal-issue":{"issue":"35","published-online":{"date-parts":[[2024,10]]}},"alternative-id":["18687"],"URL":"https:\/\/doi.org\/10.1007\/s11042-024-18687-x","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,3,11]]},"assertion":[{"value":"5 June 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 January 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 February 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 March 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no conflict of interest related\/relevant to this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}