{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T22:38:40Z","timestamp":1757630320560,"version":"3.44.0"},"reference-count":75,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,8,11]],"date-time":"2025-08-11T00:00:00Z","timestamp":1754870400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,11]],"date-time":"2025-08-11T00:00:00Z","timestamp":1754870400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,8,11]]},"DOI":"10.1109\/avss65446.2025.11149978","type":"proceedings-article","created":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T17:30:18Z","timestamp":1757439018000},"page":"1-9","source":"Crossref","is-referenced-by-count":0,"title":["Are Attention Maps Richer than we Imagined for Action Recognition?"],"prefix":"10.1109","author":[{"given":"Tanay","family":"Agrawal","sequence":"first","affiliation":[{"name":"INRIA Sophia Antipolis &#x2013; M&#x00E9;diterran&#x00E9;e,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abid","family":"Ali","sequence":"additional","affiliation":[{"name":"INRIA Sophia Antipolis &#x2013; M&#x00E9;diterran&#x00E9;e,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Antitza","family":"Dantcheva","sequence":"additional","affiliation":[{"name":"INRIA Sophia Antipolis &#x2013; M&#x00E9;diterran&#x00E9;e,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Francois","family":"Bremond","sequence":"additional","affiliation":[{"name":"INRIA Sophia Antipolis &#x2013; M&#x00E9;diterran&#x00E9;e,France"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref3","article-title":"On the opportunities and risks of foundation models","author":"Bommasani","year":"2021"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3177813"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref6","article-title":"Adaptformer: Adapting vision transformers for scalable visual recognition","author":"Chen","year":"2022","journal-title":"NeurIPS"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.3390\/s21031005"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00092"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3127885"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58545-7_5"},{"key":"ref11","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2019.00630"},{"key":"ref15","author":"Fu","year":"2021","journal-title":"VIOLET: End-to-End Video-Language Transformers with Masked Visual-token Modeling"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_13"},{"key":"ref19","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proceedings of the 36th International Conference on Machine Learning, volume 97 of Proceedings of Machine Learning Research","author":"Houlsby"},{"key":"ref20","first-page":"2790","article-title":"Parameter-efficient transfer learning for nlp","volume-title":"International Conference on Machine Learning","author":"Houlsby"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3233\/faia240489"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"ref24","first-page":"1022","article-title":"Compacter: Efficient low-rank hypercomplex adapter layers","volume":"34","author":"Karimi Mahabadi","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.47"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2014.223"},{"key":"ref27","article-title":"Transformers are rnns: Fast autoregressive transformers with linear attention","volume-title":"Proceedings of the International Conference on Machine Learning (ICML)","author":"Katharopoulos"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.5555\/2999134.2999257"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00157"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3282631"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02214"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247822"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_23"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01345"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00022"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_12"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3327605\/mm1"},{"key":"ref46","author":"Oquab","year":"2023","journal-title":"Dinov2: Learning robust visual features without supervision"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"ref48","first-page":"26462","article-title":"St-adapter: Parameter-efficient image-to-video transfer learning","volume":"35","author":"Pan","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00219"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01018"},{"key":"ref52","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref53","article-title":"Hiera: A hierarchical vision transformer without the bells-and-whistles","author":"Ryali","year":"2023","journal-title":"ICML"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_39"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01230"},{"key":"ref57","first-page":"27","article-title":"Two-stream convolutional networks for action recognition in videos","author":"Simonyan","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"ref59","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","author":"Tong","year":"2022","journal-title":"Neural Information Processing Systems"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref62","first-page":"5696","article-title":"Omnivl: One foundation model for image-language and video-language tasks","volume":"35","author":"Wang","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2868668"},{"key":"ref65","author":"Wang","year":"2022","journal-title":"Internvideo: General video foundation models via generative and discriminative learning"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"ref67","author":"Xue","year":"2022","journal-title":"Clip-vip: Adapting pre-trained image-text model to video-language representation alignment"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.5244\/C.35.4"},{"key":"ref71","article-title":"Coca: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022"},{"key":"ref72","article-title":"Florence: A new foundation model for computer vision","author":"Yuan","year":"2021"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"ref74","article-title":"Co-training transformer with videos and images improves action recognition","author":"Zhang","year":"2021"},{"key":"ref75","article-title":"Co-training transformer with videos and images improves action recognition","author":"Zhang","year":"2021"}],"event":{"name":"2025 IEEE International Conference on Advanced Visual and Signal-Based Systems (AVSS)","location":"Tainan, Taiwan","start":{"date-parts":[[2025,8,11]]},"end":{"date-parts":[[2025,8,13]]}},"container-title":["2025 IEEE International Conference on Advanced Visual and Signal-Based Systems (AVSS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11149666\/11149641\/11149978.pdf?arnumber=11149978","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T17:41:16Z","timestamp":1757526076000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11149978\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,11]]},"references-count":75,"URL":"https:\/\/doi.org\/10.1109\/avss65446.2025.11149978","relation":{},"subject":[],"published":{"date-parts":[[2025,8,11]]}}}