{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T06:21:28Z","timestamp":1776838888720,"version":"3.51.2"},"reference-count":26,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1109\/icip51287.2024.10648200","type":"proceedings-article","created":{"date-parts":[[2024,9,27]],"date-time":"2024-09-27T18:34:45Z","timestamp":1727462085000},"page":"332-338","source":"Crossref","is-referenced-by-count":2,"title":["MVAFormer: RGB-Based Multi-View Spatio-Temporal Action Recognition with Transformer"],"prefix":"10.1109","author":[{"given":"Taiga","family":"Yamane","sequence":"first","affiliation":[{"name":"NTT Corporation,NTT Human Informatics Laboratories,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satoshi","family":"Suzuki","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Human Informatics Laboratories,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ryo","family":"Masumura","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Human Informatics Laboratories,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shotaro","family":"Tora","sequence":"additional","affiliation":[{"name":"NTT Corporation,NTT Human Informatics Laboratories,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"ref5","article-title":"The ava-kinetics localized human actions video dataset","author":"Li","year":"2020","journal-title":"arXiv preprint arXiv:2005.00214"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref7","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2021","journal-title":"ICLR"},{"key":"ref8","article-title":"VideoMAE: Masked autoencoders are data-efficient learners for self-supervised video pre-training","author":"Tong","year":"2022","journal-title":"NeurIPS"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00875"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01238"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01240-3_28"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746006"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58571-6_1"},{"key":"ref16","article-title":"Detr3d: 3d object detection from multi-view images via 3d-to-2d queries","author":"Wang","year":"2022","journal-title":"CoRL"},{"key":"ref17","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref18","article-title":"A comprehensive study of deep video action recognition","author":"Zhu","year":"2020","journal-title":"arXiv preprint arXiv:2012.06567"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2008.2005594"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00053"},{"key":"ref22","article-title":"Relation modeling in spatio-temporal action localization","author":"Feng","year":"2021","journal-title":"arXiv preprint arXiv:2106.08061"},{"key":"ref23","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","author":"Ren","year":"2015","journal-title":"NIPS"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref26","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2018","journal-title":"ICLR"}],"event":{"name":"2024 IEEE International Conference on Image Processing (ICIP)","location":"Abu Dhabi, United Arab Emirates","start":{"date-parts":[[2024,10,27]]},"end":{"date-parts":[[2024,10,30]]}},"container-title":["2024 IEEE International Conference on Image Processing (ICIP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10647221\/10647122\/10648200.pdf?arnumber=10648200","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T05:25:18Z","timestamp":1727501118000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10648200\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1109\/icip51287.2024.10648200","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]}}}