{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T17:17:22Z","timestamp":1771521442056,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,9,22]],"date-time":"2023-09-22T00:00:00Z","timestamp":1695340800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,9,22]]},"DOI":"10.1145\/3641584.3641646","type":"proceedings-article","created":{"date-parts":[[2024,6,14]],"date-time":"2024-06-14T22:44:43Z","timestamp":1718405083000},"page":"421-426","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["STDN: A SpatioTemporal Difference Network for Video Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8773-3148","authenticated-orcid":false,"given":"Zhao","family":"Guo","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Xi'an University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9516-1947","authenticated-orcid":false,"given":"Yuelei","family":"Xiao","sequence":"additional","affiliation":[{"name":"School of Modern Posts, Xi'an University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4079-4881","authenticated-orcid":false,"given":"yi Jun","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xi'an University of Posts and Telecommunications, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-2511-4586","authenticated-orcid":false,"given":"Cheng","family":"Fan","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, Xi'an University of Posts and Telecommunications, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings, Part III. Cham: Springer Nature Switzerland","author":"Xiang W","year":"2022","unstructured":"Xiang W, Li C, Wang B, Spatiotemporal Self-attention Modeling with Temporal Patch Shift for Action Recognition[C]\/\/Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part III. Cham: Springer Nature Switzerland, 2022: 627-644."},{"key":"e_1_3_2_1_2_1","volume-title":"Multi-scale Motion-Aware Module for Video Action Recognition[C]\/\/European Conference on Computer Vision","author":"Peng H W","year":"2023","unstructured":"Peng H W, Tseng Y C. Multi-scale Motion-Aware Module for Video Action Recognition[C]\/\/European Conference on Computer Vision. Springer, Cham, 2023: 589-606."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Long F Qiu Z Pan Y Stand-alone inter-frame attention in video models[C]\/\/Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2022: 3192-3201.","DOI":"10.1109\/CVPR52688.2022.00319"},{"key":"e_1_3_2_1_4_1","author":"Xie Z","year":"2022","unstructured":"Xie Z, Chen J, Wu K, Global Temporal Difference Network for Action Recognition[J]. IEEE Transactions on Multimedia, 2022.","journal-title":"IEEE Transactions on Multimedia"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103484"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Ahn D Kim S Hong H STAR-Transformer: A Spatio-temporal Cross Attention Transformer for Human Action Recognition[C]\/\/Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision. 2023: 3330-3339.","DOI":"10.1109\/WACV56688.2023.00333"},{"key":"e_1_3_2_1_7_1","first-page":"1","article-title":"an action recognition network with enhanced spatio-temporal information[J]","volume":"2023","author":"Jiang Z Y","unstructured":"Jiang Z Y, Zhang Y, Hu S. ESTI: an action recognition network with enhanced spatio-temporal information[J]. International Journal of Machine Learning and Cybernetics, 2023: 1-12.","journal-title":"International Journal of Machine Learning and Cybernetics"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2021:  1895-1904","author":"Wang L","unstructured":"Wang L, Tong Z, Ji B, Tdn: Temporal difference networks for efficient action recognition. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2021: 1895-1904."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01661-1"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3180585"},{"key":"e_1_3_2_1_11_1","volume-title":"Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems","author":"Simonyan K","year":"2014","unstructured":"Simonyan K, Zisserman A. Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems, 2014, 27."},{"key":"e_1_3_2_1_12_1","volume-title":"proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017: 6299-6308","author":"Carreira J","unstructured":"Carreira J, Zisserman A. Quo vadis, action recognition? a new model and the kinetics dataset. proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017: 6299-6308."},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2019:  12026-12035","author":"Shi L","unstructured":"Shi L, Zhang Y, Cheng J, Two-stream adaptive graph convolutional networks for skeleton-based action recognition. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2019: 12026-12035."},{"key":"e_1_3_2_1_14_1","volume-title":"Temporal segment networks: Towards good practices for deep action recognition. European conference on computer vision","author":"Wang L","year":"2016","unstructured":"Wang L, Xiong Y, Wang Z, Temporal segment networks: Towards good practices for deep action recognition. European conference on computer vision. Springer, Cham, 2016: 20-36."},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the European conference on computer vision (ECCV). 2018: 803-818","author":"Zhou B","unstructured":"Zhou B, Andonian A, Oliva A, Temporal relational reasoning in videos. Proceedings of the European conference on computer vision (ECCV). 2018: 803-818."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 2019:  7083-7093","author":"Lin J","unstructured":"Lin J, Gan C, Han S. Tsm: Temporal shift module for efficient video understanding. Proceedings of the IEEE\/CVF international conference on computer vision. 2019: 7083-7093."},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 2019:  6202-6211","author":"Feichtenhofer C","unstructured":"Feichtenhofer C, Fan H, Malik J, Slowfast networks for video recognition. Proceedings of the IEEE\/CVF international conference on computer vision. 2019: 6202-6211.."},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV). 2018: 387-403","author":"Lee M","unstructured":"Lee M, Lee S, Son S, Motion feature network: Fixed motion filter for action recognition. Proceedings of the European Conference on Computer Vision (ECCV). 2018: 387-403."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2020:  909-918","author":"Li Y","unstructured":"Li Y, Ji B, Shi X, Tea: Temporal excitation and aggregation for action recognition. Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2020: 909-918."},{"key":"e_1_3_2_1_21_1","volume-title":"proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017: 6299-6308","author":"Carreira J","unstructured":"Carreira J, Zisserman A. Quo vadis, action recognition? a new model and the kinetics dataset. proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017: 6299-6308."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 2019:  6202-6211","author":"Feichtenhofer C","unstructured":"Feichtenhofer C, Fan H, Malik J, Slowfast networks for video recognition. Proceedings of the IEEE\/CVF international conference on computer vision. 2019: 6202-6211."},{"key":"e_1_3_2_1_23_1","volume-title":"proceedings of the IEEE International Conference on Computer Vision. 2017: 5533-5541","author":"Qiu Z","unstructured":"Qiu Z, Yao T, Mei T. Learning spatio-temporal representation with pseudo-3d residual networks. proceedings of the IEEE International Conference on Computer Vision. 2017: 5533-5541."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the IEEE conference on Computer Vision and Pattern Recognition. 2018:  6450-6459","author":"Tran D","unstructured":"Tran D, Wang H, Torresani L, A closer look at spatiotemporal convolutions for action recognition. Proceedings of the IEEE conference on Computer Vision and Pattern Recognition. 2018: 6450-6459."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the European conference on computer vision (ECCV). 2018: 305-321","author":"Xie S","unstructured":"Xie S, Sun C, Huang J, Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. Proceedings of the European conference on computer vision (ECCV). 2018: 305-321."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Goyal R Ebrahimi Kahou S Michalski V The\" something something\" video database for learning and evaluating visual common sense[C]\/\/Proceedings of the IEEE international conference on computer vision. 2017: 5842-5850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"e_1_3_2_1_27_1","volume-title":"Gan W","author":"Jiang B","year":"2019","unstructured":"Jiang B, Wang M M, Gan W, Stm: Spatiotemporal and motion encoding for action recognition[C]\/\/Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019: 2000-2009."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Luo C Yuille A L. Grouped spatial-temporal aggregation for efficient action recognition[C]\/\/Proceedings of the IEEE\/CVF International Conference on Computer Vision. 2019: 5512-5521.","DOI":"10.1109\/ICCV.2019.00561"}],"event":{"name":"AIPR 2023: 2023 6th International Conference on Artificial Intelligence and Pattern Recognition","location":"Xiamen China","acronym":"AIPR 2023"},"container-title":["2023 6th International Conference on Artificial Intelligence and Pattern Recognition (AIPR)"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641584.3641646","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3641584.3641646","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:30Z","timestamp":1750295850000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3641584.3641646"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9,22]]},"references-count":28,"alternative-id":["10.1145\/3641584.3641646","10.1145\/3641584"],"URL":"https:\/\/doi.org\/10.1145\/3641584.3641646","relation":{},"subject":[],"published":{"date-parts":[[2023,9,22]]},"assertion":[{"value":"2024-06-14","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}