{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:50:58Z","timestamp":1765309858027,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","funder":[{"name":"Frontier Leading Technology Basic Research Program of Jiangsu Province","award":["BK20202001"],"award-info":[{"award-number":["BK20202001"]}]},{"name":"Major Sports Research Program of Jiangsu Sports Bureau","award":["ST241104"],"award-info":[{"award-number":["ST241104"]}]},{"name":"Postgraduate Research & Practice Innovation Program of Jiangsu Province","award":["SJCX24_0285"],"award-info":[{"award-number":["SJCX24_0285"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754807","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:54:17Z","timestamp":1761375257000},"page":"9100-9109","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["RSFomer: Time Series Transformer for Robust Sports Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1388-6771","authenticated-orcid":false,"given":"Yongan","family":"Guo","sequence":"first","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, NanJing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0763-6841","authenticated-orcid":false,"given":"Zhongyan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5553-4278","authenticated-orcid":false,"given":"Yuao","family":"Wang","sequence":"additional","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, NanJing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6863-0933","authenticated-orcid":false,"given":"Na","family":"Zhu","sequence":"additional","affiliation":[{"name":"Jiangsu Research Institute of Sports Science, NanJing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7353-4159","authenticated-orcid":false,"given":"Xuyun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Macquarie University, Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7118-1038","authenticated-orcid":false,"given":"Hongwang","family":"Xiao","sequence":"additional","affiliation":[{"name":"Beijing Academy of Artificial Intelligence, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6712-3465","authenticated-orcid":false,"given":"Yuan","family":"Miao","sequence":"additional","affiliation":[{"name":"Victoria University, Melbourne, VIC, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3226-388X","authenticated-orcid":false,"given":"Bo","family":"Li","sequence":"additional","affiliation":[{"name":"Victoria University, Melbourne, VIC, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Jason Lines, Michael Flynn, James Large, Aaron Bostrom, Paul Southam, and Eamonn Keogh.","author":"Bagnall Anthony","year":"2018","unstructured":"Anthony Bagnall, Hoang Anh Dau, Jason Lines, Michael Flynn, James Large, Aaron Bostrom, Paul Southam, and Eamonn Keogh. 2018. The textUEA multivariate time series classification archive, 2018. arXiv preprint arXiv:1811.00075 (2018)."},{"key":"e_1_3_2_1_2_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 7291-7299","author":"Cao Zhe","year":"2017","unstructured":"Zhe Cao, Tomas Simon, Shih-En Wei, and Yaser Sheikh. 2017. Realtime multi-person text2D pose estimation using part affinity fields. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 7291-7299."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 6299-6308","author":"Carreira Joao","year":"2017","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo vadis, action recognition? textA new model and the kinetics dataset. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 6299-6308."},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the International Joint Conference on Artificial Intelligence. 2285-2291","author":"Chen Zipeng","year":"2021","unstructured":"Zipeng Chen, Qianli Ma, and Zhenxi Lin. 2021. Time-aware multi-scale textRNNs for time series modeling.. In Proceedings of the International Joint Conference on Artificial Intelligence. 2285-2291."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583205"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539329"},{"key":"e_1_3_2_1_7_1","unstructured":"MMPose Contributors. 2020. textOpenMMLab Pose Estimation Toolbox and Benchmark. https:\/\/github.com\/open-mmlab\/mmpose."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2013.02.030"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2023.119009"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 1110-1118","author":"Du Yong","year":"2015","unstructured":"Yong Du, Wei Wang, and Liang Wang. 2015. Hierarchical recurrent neural network for skeleton based action recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 1110-1118."},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision. 6202-6211","author":"Feichtenhofer Christoph","year":"2019","unstructured":"Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He. 2019. textSlowFast networks for video recognition. In Proceedings of the IEEE\/CVF international conference on computer vision. 6202-6211."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10618-023-00948-2"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.244"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2007.70711"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053928"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2025.3584916"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1186\/s40798-020-0237-5"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2022.109494"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2022.09.009"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2019.04.014"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10618-017-0495-0"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2893318"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671862"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.dss.2011.12.014"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ymssp.2019.03.013"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.2995870"},{"key":"e_1_3_2_1_31_1","first-page":"8375","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"35","author":"Li Guozhong","year":"2021","unstructured":"Guozhong Li, Byron Choi, Jianliang Xu, Sourav S Bhowmick, Kwok-Pan Chun, and Grace Lai-Hung Wong. 2021a. textShapeNet: textA shapelet-neural network approach for multivariate time series classification. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 35. 8375-8383."},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3383-3393","author":"Li Jiefeng","year":"2021","unstructured":"Jiefeng Li, Chao Xu, Zhicun Chen, Siyuan Bian, Lixin Yang, and Cewu Lu. 2021b. textHybrIK: A hybrid analytical-neural inverse kinematics solution for text3D human pose and shape estimation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 3383-3393."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.2297321"},{"key":"e_1_3_2_1_34_1","volume-title":"textTodyNet: Temporal dynamic graph neural network for multivariate time series classification. Information Sciences","author":"Liu Huaiyuan","year":"2024","unstructured":"Huaiyuan Liu, Donghua Yang, Xianzhang Liu, Xinglei Chen, Zhiyu Liang, Hongzhi Wang, Yong Cui, and Jun Gu. 2024. textTodyNet: Temporal dynamic graph neural network for multivariate time series classification. Information Sciences (2024), 120914."},{"key":"e_1_3_2_1_35_1","volume-title":"Gated transformer networks for multivariate time series classification. arXiv preprint arXiv:2103.14438","author":"Liu Minghao","year":"2021","unstructured":"Minghao Liu, Shengqi Ren, Siyuan Ma, Jiahui Jiao, Yizhou Chen, Zhiguang Wang, and Wei Song. 2021. Gated transformer networks for multivariate time series classification. arXiv preprint arXiv:2103.14438 (2021)."},{"key":"e_1_3_2_1_36_1","first-page":"2752","article-title":"Multi-task deep learning for real-time 3D human pose estimation and action recognition","volume":"43","author":"Luvizon Diogo C","year":"2020","unstructured":"Diogo C Luvizon, David Picard, and Hedi Tabia. 2020. Multi-task deep learning for real-time 3D human pose estimation and action recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 43, 8 (2020), 2752-2764.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.3390\/app11104426"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-021-06057-9"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i18.34155"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.3390\/e25060844"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.03.013"},{"key":"e_1_3_2_1_42_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Shao Dian","year":"2020","unstructured":"Dian Shao, Yue Zhao, Bo Dai, and Dahua Lin. 2020. textFineGym: A Hierarchical Video Dataset for Fine-grained Action Understanding. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-023-01835-4"},{"key":"e_1_3_2_1_44_1","volume-title":"Advances in Neural Information Processing Systems","volume":"27","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. Advances in Neural Information Processing Systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10618-022-00895-4"},{"key":"e_1_3_2_1_46_1","first-page":"6827","article-title":"Development of fully convolutional neural networks based on discretization in time series classification","volume":"35","author":"Tahan Marzieh Hajizadeh","year":"2022","unstructured":"Marzieh Hajizadeh Tahan, Mohammad Ghasemzadeh, and Shahrokh Asadi. 2022. Development of fully convolutional neural networks based on discretization in time series classification. IEEE Transactions on Knowledge and Data Engineering, Vol. 35, 7 (2022), 6827-6838.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia. 1514-1523","author":"Tang Tao","year":"2024","unstructured":"Tao Tang, Hong Liu, Yingxuan You, Ti Wang, and Wenhao Li. 2024. textARTS: Semi-analytical regressor using disentangled skeletal representations for human mesh recovery from videos. In Proceedings of the 32nd ACM International Conference on Multimedia. 1514-1523."},{"key":"e_1_3_2_1_48_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.460"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.82"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"e_1_3_2_1_53_1","first-page":"951","article-title":"textTac-Trainer: A visual analytics system for textIoT-based racket sports training","volume":"29","author":"Wang Jiachen","year":"2022","unstructured":"Jiachen Wang, Ji Ma, Kangping Hu, Zheng Zhou, Hui Zhang, Xiao Xie, and Yingcai Wu. 2022. textTac-Trainer: A visual analytics system for textIoT-based racket sports training. IEEE Transactions on Visualization and Computer Graphics, Vol. 29, 1 (2022), 951-961.","journal-title":"IEEE Transactions on Visualization and Computer Graphics"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3580305.3599549"},{"key":"e_1_3_2_1_55_1","volume-title":"Temporal segment networks for action recognition in videos","author":"Wang Limin","year":"2018","unstructured":"Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. 2018. Temporal segment networks for action recognition in videos. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 11 (2018), 2740-2755."},{"key":"e_1_3_2_1_56_1","volume-title":"IOP conference series: materials science and engineering","volume":"569","author":"Wang Xianyuan","year":"2019","unstructured":"Xianyuan Wang, Zhenjiang Miao, Ruyi Zhang, and Shanshan Hao. 2019. textI3D-LS\u2122: A new model for human action recognition. In IOP conference series: materials science and engineering, Vol. 569. IOP Publishing, 032035."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3232034"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2015.12.041"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2024.109177"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.124591"},{"key":"e_1_3_2_1_62_1","volume-title":"Proceedings of the 32nd ACM International Conference on Multimedia. 1672-1681","author":"Zhang Jinyan","year":"2024","unstructured":"Jinyan Zhang, Mengyuan Liu, Hong Liu, Guoquan Wang, and Wenhao Li. 2024b. textAPP: Adaptive Pose Pooling for text3D Human Pose Estimation from Videos. In Proceedings of the 32nd ACM International Conference on Multimedia. 1672-1681."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2802648"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2023.01.093"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 11656-11665","author":"Zheng Ce","year":"2021","unstructured":"Ce Zheng, Sijie Zhu, Matias Mendieta, Taojiannan Yang, Chen Chen, and Zhengming Ding. 2021. text3D human pose estimation with spatial and temporal transformers. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 11656-11665."},{"key":"e_1_3_2_1_66_1","first-page":"11497","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"37","author":"Zuo Rundong","year":"2023","unstructured":"Rundong Zuo, Guozhong Li, Byron Choi, Sourav S Bhowmick, Daphne Ngar-yin Mah, and Grace LH Wong. 2023. textSVP-T: A shape-level variable-position transformer for multivariate time series classification. In Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 37. 11497-11505."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754807","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:46:47Z","timestamp":1765309607000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754807"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":66,"alternative-id":["10.1145\/3746027.3754807","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754807","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}