{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:51Z","timestamp":1781539071753,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810614","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1102-1110","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["STeP-Net: A Spatio-Temporal Perception Network for Action Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8631-4240","authenticated-orcid":false,"given":"Kunfang","family":"Song","sequence":"first","affiliation":[{"name":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3640-470X","authenticated-orcid":false,"given":"Guowei","family":"Yan","sequence":"additional","affiliation":[{"name":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3796-6948","authenticated-orcid":false,"given":"Jiaqing","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6445-7656","authenticated-orcid":false,"given":"Shufen","family":"Ruan","sequence":"additional","affiliation":[{"name":"School of Mathematics and Statistics, Wuhan Textile University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3416-4356","authenticated-orcid":false,"given":"Yanwen","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_3_1_3_2","unstructured":"Alexey Bochkovskiy Chien-Yao Wang and Hong-Yuan\u00a0Mark Liao. 2020. Yolov4: Optimal speed and accuracy of object detection. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2004.10934 (2020)."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00953"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Lei Chen Zhan Tong Yibing Song Gangshan Wu and Limin Wang. 2025. Cycleacr: Cycle modeling of actor-context relations for video action detection. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025).","DOI":"10.1109\/TPAMI.2025.3595393"},{"key":"e_1_3_3_1_7_2","unstructured":"Lu Chi Borui Jiang and Yadong Mu. 2020. Fast fourier convolution. Advances in Neural Information Processing Systems 33 (2020) 4479\u20134488."},{"key":"e_1_3_3_1_8_2","unstructured":"Duc Manh\u00a0Nguyen Dang Viet\u00a0Hang Duong Jia\u00a0Ching Wang and Nhan\u00a0Bui Duc. 2024. YOWOv3: An efficient and generalized framework for human action detection and recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.02623 (2024)."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Naz D\u00fcndar Ali\u00a0Seydi Ke\u00e7eli Ayd\u0131n Kaya and Hayri Sever. 2024. A shallow 3D convolutional neural network for violence detection in videos. Egyptian Informatics Journal 26 (2024) 100455.","DOI":"10.1016\/j.eij.2024.100455"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298676"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00633"},{"key":"e_1_3_3_1_16_2","unstructured":"Yining Hong Beide Liu Maxine Wu Yuanhao Zhai Kai-Wei Chang Linjie Li Kevin Lin Chung-Ching Lin Jianfeng Wang Zhengyuan Yang et\u00a0al. 2024. Slowfast-vgen: Slow-fast learning for action-driven long video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.23277 (2024)."},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.620"},{"key":"e_1_3_3_1_18_2","first-page":"33","volume-title":"International Conference on Intelligent Robotics and Applications","author":"Jiang Zhiqiang","year":"2024","unstructured":"Zhiqiang Jiang, Jianhua Yang, Nan Jiang, Shuaiyan Liu, Tao Xie, Lijun Zhao, and Ruifeng Li. 2024. YOWOv2: A stronger yet efficient multi-level detection framework for real-time spatio-temporal action detection. In International Conference on Intelligent Robotics and Applications. Springer, 33\u201348."},{"key":"e_1_3_3_1_19_2","unstructured":"Okan K\u00f6p\u00fckl\u00fc Xiangyu Wei and Gerhard Rigoll. 2019. You only watch once: A unified cnn architecture for real-time spatiotemporal action localization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1911.06644 (2019)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Seok\u00a0Hwan Lee Taein Son Soo\u00a0Won Seo Jisong Kim and Jun\u00a0Won Choi. 2024. JARViS: Detecting actions in video using unified actor-scene context relation modeling. Neurocomputing 610 (2024) 128616.","DOI":"10.1016\/j.neucom.2024.128616"},{"key":"e_1_3_3_1_21_2","unstructured":"Mengqi Lei Siqi Li Yihong Wu Han Hu You Zhou Xinhu Zheng Guiguang Ding Shaoyi Du Zongze Wu and Yue Gao. 2025. Yolov13: Real-time object detection with hypergraph-enhanced adaptive visual perception. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.17733 (2025)."},{"key":"e_1_3_3_1_22_2","unstructured":"Xunsong Li Pengzhan Sun Yangcen Liu Lixin Duan and Wen Li. 2025. Simultaneous detection and interaction reasoning for object-centric action recognition. IEEE Transactions on Multimedia (2025)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_5"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW67362.2025.00247"},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Yu Liu Fan Yang and Dominique Ginhac. 2023. Accumulated micro-motion representations for lightweight online action detection in real-time. Journal of Visual Communication and Image Representation 95 (2023) 103879.","DOI":"10.1016\/j.jvcir.2023.103879"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_3_1_27_2","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1711.05101 (2017)."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Muhammad Munsif Noman Khan Altaf Hussain Min\u00a0Je Kim and Sung\u00a0Wook Baik. 2024. Darkness-adaptive action recognition: Leveraging efficient tubelet slow-fast network for industrial applications. IEEE Transactions on Industrial Informatics (2024).","DOI":"10.1109\/TII.2024.3431070"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00232"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"crossref","unstructured":"Bo Pang Gao Peng Yizhuo Li and Cewu Lu. 2024. Markov Progressive Framework a Universal Paradigm for Modeling Long Videos. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024).","DOI":"10.1109\/TPAMI.2024.3426998"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Yunzhong Si Huiying Xu Xinzhong Zhu Wenhao Zhang Yao Dong Yuxing Chen and Hongbo Li. 2025. SCSA: Exploring the synergistic effects between spatial and channel attention. Neurocomputing 634 (2025) 129866.","DOI":"10.1016\/j.neucom.2025.129866"},{"key":"e_1_3_3_1_32_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i7.32745"},{"key":"e_1_3_3_1_34_2","unstructured":"Khurram Soomro Amir\u00a0Roshan Zamir and Mubarak Shah. 2012. Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1212.0402 (2012)."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01229"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00594"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_20"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_5"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"crossref","unstructured":"Zhan Tong Yibing Song Jue Wang and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems 35 (2022) 10078\u201310093.","DOI":"10.52202\/068431-0732"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_3_1_41_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"crossref","unstructured":"Peng Wang Fanwei Zeng and Yuntao Qian. 2024. A survey on deep learning-based spatio-temporal action detection. International Journal of Wavelets Multiresolution and Information Processing 22 04 (2024) 2350066.","DOI":"10.1142\/S0219691323500662"},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"crossref","unstructured":"Wenju Wang Zehua Gu Bang Tang Sen Wang and Jianfei Hao. 2025. ACSF-ED: Adaptive Cross-Scale Fusion Encoder-Decoder for Spatio-Temporal Action Detection.Computers Materials & Continua 82 2 (2025).","DOI":"10.32604\/cmc.2024.057392"},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00037"},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Yuecong Xu Haozhi Cao Jianxiong Yin Zhenghua Chen Xiaoli Li Zhengguo Li Qianwen Xu and Jianfei Yang. 2024. Going deeper into recognizing actions in dark environments: A comprehensive benchmark study. International Journal of Computer Vision 132 4 (2024) 1292\u20131309.","DOI":"10.1007\/s11263-023-01932-5"},{"key":"e_1_3_3_1_46_2","unstructured":"Jianhua Yang. 2022. Yowo-plus: An incremental improvement. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.11219 (2022)."},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"crossref","unstructured":"Rui Yang Hui Zhang Mulan Qiu and Min Wang. 2025. MLSTIF: multi-level spatio-temporal and human-object interaction feature fusion network for spatio-temporal action detection. Multimedia Systems 31 3 (2025) 1\u201319.","DOI":"10.1007\/s00530-025-01796-4"},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00035"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01323"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00233"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:48:31Z","timestamp":1781538511000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810614"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":49,"alternative-id":["10.1145\/3805622.3810614","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810614","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}