{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T08:38:52Z","timestamp":1771922332868,"version":"3.50.1"},"reference-count":72,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccvw69036.2025.00281","type":"proceedings-article","created":{"date-parts":[[2026,2,23]],"date-time":"2026-02-23T20:44:02Z","timestamp":1771879442000},"page":"2700-2710","source":"Crossref","is-referenced-by-count":0,"title":["Learning Robust Aligned Representations Across Multiple Visual Modalities in Human Action Recognition"],"prefix":"10.1109","author":[{"given":"David J.","family":"Lerch","sequence":"first","affiliation":[{"name":"Fraunhofer IOSB,Karlsruhe,Germany"}]},{"given":"Bastian","family":"Rothenburger","sequence":"additional","affiliation":[{"name":"Goethe University Frankfurt,Frankfurt,Germany"}]},{"given":"Zeyun","family":"Zhong","sequence":"additional","affiliation":[{"name":"Fraunhofer IOSB,Karlsruhe,Germany"}]},{"given":"Manuel","family":"Martin","sequence":"additional","affiliation":[{"name":"Fraunhofer IOSB,Karlsruhe,Germany"}]},{"given":"Frederik","family":"Diederichs","sequence":"additional","affiliation":[{"name":"Fraunhofer IOSB,Karlsruhe,Germany"}]},{"given":"Rainer","family":"Stiefelhagen","sequence":"additional","affiliation":[{"name":"Karlsruhe Institute of Technology (KIT),Karlsruhe,Germany"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00333"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.cogsys.2018.04.002"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2769085"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3185058"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19809-0_11"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01955"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00553"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21236\/ADA623249"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2015.7298714"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-92659-5_17"},{"key":"ref14","article-title":"Clip2video: Mastering video-text retrieval via image clip","author":"Fang","year":"2021","journal-title":"arXiv preprint"},{"issue":"10","key":"ref15","first-page":"2581","article-title":"Learning with privileged information via adversarial discriminative modality distillation","volume":"42","author":"Nuno","year":"2019","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70711"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3410301"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00548"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-005-1838-7"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00331"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC58415.2024.10920160"},{"key":"ref25","first-page":"211","article-title":"Unsupervised 3D skeleton-based action recognition using cross-attention with conditioned generation capabilities","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"David","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.537"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_23"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02484"},{"key":"ref29","volume-title":"LLaVA: Large language and vision assistant","author":"Liu","year":"2023"},{"key":"ref30","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in neural information processing systems"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.391"},{"key":"ref32","first-page":"10","article-title":"Skepxels: Spatio-temporal image representation of human skeleton joints for action recognition","volume-title":"CVPR workshops","author":"Liu","year":"2019"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3521749"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3086590"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00934"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00289"},{"issue":"4","key":"ref39","first-page":"43","article-title":"A new technique based on 3d convolutional neural networks and filtering optical flow maps for action classification in infrared video","volume":"21","author":"Meglouli","year":"2019","journal-title":"Journal of Control Engineering and Applied Informatics"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.29007\/1mjd"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/IROS47612.2022.9981445"},{"key":"ref42","article-title":"Video-based human action recognition using deep learning: a review","author":"Hieu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref43","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.621"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-019-08576-z"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2025.104389"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-022-12091-z"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"ref49","article-title":"Star: Sparse transformer-based action recognition","author":"Shi","year":"2021","journal-title":"arXiv preprint"},{"key":"ref50","article-title":"A survey on human action recognition","author":"SHUchang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref51","article-title":"Two-stream convolutional networks for action recognition in videos","volume":"27","author":"Simonyan","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2967577"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.207"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612449"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3183112"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.3390\/s20113305"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2017.123"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12228"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02206"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_7"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW59549.2023.00045"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2018.12.050"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111902"},{"key":"ref67","article-title":"CLIP-vip: Adapting pre-trained image-text model to video-language alignment","volume-title":"The Eleventh International Conference on Learning Repre-sentations","author":"Xue","year":"2023"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"ref69","article-title":"Aim: Adapting image models for efficient video action recognition","author":"Yang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475473"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25495"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,20]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11373940\/11374285\/11375285.pdf?arnumber=11375285","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T07:34:08Z","timestamp":1771918448000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11375285\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":72,"URL":"https:\/\/doi.org\/10.1109\/iccvw69036.2025.00281","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}