{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T20:23:58Z","timestamp":1769199838258,"version":"3.49.0"},"reference-count":12,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T00:00:00Z","timestamp":1739836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,2,18]],"date-time":"2025-02-18T00:00:00Z","timestamp":1739836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,2,18]]},"DOI":"10.1109\/icaiic64266.2025.10920711","type":"proceedings-article","created":{"date-parts":[[2025,3,19]],"date-time":"2025-03-19T18:08:03Z","timestamp":1742407683000},"page":"0820-0824","source":"Crossref","is-referenced-by-count":2,"title":["State Space Model Based VideoMAE Enhancement for Efficient Video Action Classification"],"prefix":"10.1109","author":[{"given":"Junbeom","family":"Moon","sequence":"first","affiliation":[{"name":"School of Computer Science &#x0026; Engineering, Kyungpook National University,Daegu,South Korea"}]},{"given":"Sehwan","family":"Heo","sequence":"additional","affiliation":[{"name":"School of Computer Science &#x0026; Engineering, Kyungpook National University,Daegu,South Korea"}]},{"given":"Jiye","family":"Won","sequence":"additional","affiliation":[{"name":"School of Computer &#x0026; Engineering, Kyungpook National University,Daegu,South Korea"}]},{"given":"Jaeseok","family":"Jang","sequence":"additional","affiliation":[{"name":"School of Computer &#x0026; Engineering, Kyungpook National University,Daegu,South Korea"}]},{"given":"Soon Ki","family":"Jung","sequence":"additional","affiliation":[{"name":"School of Computer Science &#x0026; Engineering, Kyungpook National University,Daegu,South Korea"}]}],"member":"263","reference":[{"key":"ref1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Alexey","year":"2020","journal-title":"arXiv preprint"},{"key":"ref2","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref3","first-page":"10078","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pretraining","volume":"35","author":"Zhan","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref4","article-title":"Masked autoencoders are scalable vision learners","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"Kaiming","year":"2022"},{"key":"ref5","article-title":"Efficiently modeling long sequences with structured state spaces","author":"Albert","year":"2021","journal-title":"arXiv preprint"},{"key":"ref6","first-page":"572","article-title":"Combining recurrent, convolutional, and continuous-time models with linear state space layers","volume":"34","author":"Albert","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref7","article-title":"Vision mamba: Efficient visual representation learning with bidirectional state space model","author":"Lianghui","year":"2024","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Two-stream convolutional networks for action recognition in videos","volume":"27","author":"Karen","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref9","article-title":"Learning spatiotemporal features with 3d convolutional networks","volume-title":"Proceedings of the IEEE international conference on computer vision","author":"Du","year":"2015"},{"key":"ref10","article-title":"Quo vadis, action recognition? a new model and the kinetics dataset","volume-title":"proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Joao","year":"2017"},{"key":"ref11","article-title":"Mamba: Linear-time sequence modeling with selective state spaces","author":"Albert","year":"2023","journal-title":"arXiv preprint"},{"key":"ref12","article-title":"The kinetics human action video dataset","author":"Will","year":"2017","journal-title":"arXiv preprint"}],"event":{"name":"2025 International Conference on Artificial Intelligence in Information and Communication (ICAIIC)","location":"Fukuoka, Japan","start":{"date-parts":[[2025,2,18]]},"end":{"date-parts":[[2025,2,21]]}},"container-title":["2025 International Conference on Artificial Intelligence in Information and Communication (ICAIIC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10919472\/10920631\/10920711.pdf?arnumber=10920711","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,21]],"date-time":"2025-03-21T04:03:43Z","timestamp":1742529823000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10920711\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,18]]},"references-count":12,"URL":"https:\/\/doi.org\/10.1109\/icaiic64266.2025.10920711","relation":{},"subject":[],"published":{"date-parts":[[2025,2,18]]}}}