{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T20:10:09Z","timestamp":1755893409446,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,15]],"date-time":"2023-12-15T00:00:00Z","timestamp":1702598400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,15]]},"DOI":"10.1145\/3627631.3627640","type":"proceedings-article","created":{"date-parts":[[2024,1,31]],"date-time":"2024-01-31T12:08:32Z","timestamp":1706702912000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["STTGC-Net: Spatial-Temporal Transformer with Graph Convolution for Skeleton-Based Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-7470-1578","authenticated-orcid":false,"given":"Tanishka","family":"Yagneshwar","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, Shiv Nadar Institute of Eminence, IN"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2196-8980","authenticated-orcid":false,"given":"Snehasis","family":"Mukherjee","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, Shiv Nadar Institute of Eminence, Greater Noida, India, IN"}]}],"member":"320","published-online":{"date-parts":[[2024,1,31]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Anurag Arnab Mostafa Dehghani Georg Heigold Chen Sun Mario Lu\u010di\u0107 and Cordelia Schmid. 2021. ViViT: A Video Vision Transformer. arxiv:2103.15691\u00a0[cs.CV]","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3019293"},{"key":"e_1_3_2_2_3_1","unstructured":"Gedas Bertasius Heng Wang and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding?arxiv:2102.05095\u00a0[cs.CV]"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929257"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Nicolas Carion Francisco Massa Gabriel Synnaeve Nicolas Usunier Alexander Kirillov and Sergey Zagoruyko. 2020. End-to-End Object Detection with Transformers. arxiv:2005.12872\u00a0[cs.CV]","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.3390\/app12189229"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"Yuxin Chen Ziqi Zhang Chunfeng Yuan Bing Li Ying Deng and Weiming Hu. 2021. Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition. In ICCV. 13359\u201313368.","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Zhan Chen Sicheng Li Bing Yang Qinghan Li and Hong Liu. 2021. Multi-Scale Spatial Temporal Graph Convolutional Network for Skeleton-Based Action Recognition. In AAAI. 1113\u20131122.","DOI":"10.1609\/aaai.v35i2.16197"},{"key":"e_1_3_2_2_9_1","volume-title":"Decoupling GCN with DropGraph Module for Skeleton-Based Action Recognition. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:227232051","author":"Cheng Ke","year":"2020","unstructured":"Ke Cheng, Yifan Zhang, Congqi Cao, Lei Shi, Jian Cheng, and Hanqing Lu. 2020. Decoupling GCN with DropGraph Module for Skeleton-Based Action Recognition. In European Conference on Computer Vision. https:\/\/api.semanticscholar.org\/CorpusID:227232051"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"e_1_3_2_2_11_1","unstructured":"Anton Chernyavskiy Dmitry Ilvovsky and Preslav Nakov. 2021. Transformers: \"The End of History\" for NLP?arxiv:2105.00813\u00a0[cs.CL]"},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 20186\u201320196","author":"Ha Myoung\u00a0Hoon","year":"2022","unstructured":"Hyung-gun Chi, Myoung\u00a0Hoon Ha, Seunggeun Chi, Sang\u00a0Wan Lee, Qixing Huang, and Karthik Ramani. 2022. InfoGCN: Representation Learning for Human Skeleton-Based Action Recognition. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 20186\u201320196."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8851702"},{"key":"e_1_3_2_2_14_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arxiv:2010.11929\u00a0[cs.CV]"},{"volume-title":"Revisiting Skeleton-based Action Recognition. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 2969\u20132978","author":"Duan H.","key":"e_1_3_2_2_15_1","unstructured":"H. Duan, Y. Zhao, K. Chen, D. Lin, and B. Dai. 2022. Revisiting Skeleton-based Action Recognition. In 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). 2969\u20132978."},{"key":"e_1_3_2_2_16_1","volume-title":"Revisiting Skeleton-based Action Recognition. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Duan Haodong","year":"2022","unstructured":"Haodong Duan, Yue Zhao, Kai Chen, Dian Shao, Dahua Lin, and Bo Dai. 2022. Revisiting Skeleton-based Action Recognition. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_17_1","volume-title":"Proceedings of the Asian Conference on Computer Vision (ACCV). 382\u2013398","author":"Gao Zhimin","year":"2022","unstructured":"Zhimin Gao, Peitao Wang, Pei Lv, Xiaoheng Jiang, Qidong Liu, Pichao Wang, Mingliang Xu, and Wanqing Li. 2022. Focal and Global Spatial-Temporal Transformer for Skeleton-based Action Recognition. In Proceedings of the Asian Conference on Computer Vision (ACCV). 382\u2013398."},{"key":"e_1_3_2_2_18_1","volume-title":"Skeleton Action Recognition?International Journal of Computer Vision 129","author":"Gupta Pranay","year":"2021","unstructured":"Pranay Gupta, Anirudh Thatipelli, Aditya Aggarwal, Shubh Maheshwari, Neel Trivedi, Sourav Das, and Ravi\u00a0Kiran Sarvadevabhatla. 2021. Quo Vadis, Skeleton Action Recognition?International Journal of Computer Vision 129 (2021), 2097\u20132112."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01157"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Lipeng Ke Kuan-Chuan Peng and Siwei Lyu. 2022. Towards To-a-T Spatio-Temporal Focus for Skeleton-Based Action Recognition. In AAAI. 1131\u20131139.","DOI":"10.1609\/aaai.v36i1.19998"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2812099"},{"key":"e_1_3_2_2_22_1","volume-title":"Kipf and Max Welling","author":"N.","year":"2017","unstructured":"Thomas\u00a0N. Kipf and Max Welling. 2017. Semi-Supervised Classification with Graph Convolutional Networks. arxiv:1609.02907\u00a0[cs.LG]"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Guy Lev Gil Sadeh Benjamin Klein and Lior Wolf. 2015. RNN Fisher Vectors for Action Recognition and Image Annotation. arxiv:1512.03958\u00a0[cs.CV]","DOI":"10.1007\/978-3-319-46466-4_50"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2017.8026282"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","unstructured":"Bin Li Xi Li Zhongfei Zhang and Fei Wu. 2019. Spatio-Temporal Graph Routing for Skeleton-Based Action Recognition. In Proceedings of the Thirty-Third AAAI Conference on Artificial Intelligence and Thirty-First Innovative Applications of Artificial Intelligence Conference and Ninth AAAI Symposium on Educational Advances in Artificial Intelligence (Honolulu Hawaii USA) (AAAI\u201919\/IAAI\u201919\/EAAI\u201919). AAAI Press Article 1050 8\u00a0pages. https:\/\/doi.org\/10.1609\/aaai.v33i01.33018561","DOI":"10.1609\/aaai.v33i01.33018561"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2017.8026285"},{"key":"e_1_3_2_2_27_1","unstructured":"Chao Li Qiaoyong Zhong Di Xie and Shiliang Pu. 2018. Co-occurrence Feature Learning from Skeleton Data for Action Recognition and Detection with Hierarchical Aggregation. arxiv:1804.06055\u00a0[cs.CV]"},{"key":"e_1_3_2_2_28_1","unstructured":"Maosen Li Siheng Chen Xu Chen Ya Zhang Yanfeng Wang and Qi Tian. 2019. Actional-Structural Graph Convolutional Networks for Skeleton-based Action Recognition. arxiv:1904.12659\u00a0[cs.CV]"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00234"},{"key":"e_1_3_2_2_30_1","unstructured":"Zhouhan Lin Minwei Feng Cicero\u00a0Nogueira dos Santos Mo Yu Bing Xiang Bowen Zhou and Yoshua Bengio. 2017. A Structured Self-attentive Sentence Embedding. arxiv:1703.03130\u00a0[cs.CL]"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3240472"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2771306"},{"key":"e_1_3_2_2_34_1","volume-title":"Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. arxiv:2103.14030\u00a0[cs.CV]","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. 2021. Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. arxiv:2103.14030\u00a0[cs.CV]"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00022"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3049691"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Chiara Plizzari Marco Cannici and Matteo Matteucci. 2020. Spatial Temporal Transformer Network for Skeleton-based Action Recognition. In ICPR. 694\u2013701.","DOI":"10.1007\/978-3-030-68796-0_50"},{"key":"e_1_3_2_2_38_1","volume-title":"International Joint Conference on Neural Networks.","author":"Rustogi Anshul","year":"2022","unstructured":"Anshul Rustogi and Snehasis Mukherjee. 2022. Long-term Spatio-temporal Contrastive Learning framework for Skeleton Action Recognition. In International Joint Conference on Neural Networks."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.115"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00810"},{"key":"e_1_3_2_2_41_1","volume-title":"Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Shi Lei","year":"2019","unstructured":"Lei Shi, Yifan Zhang, Jian Cheng, and Hanqing Lu. 2019. Two-Stream Adaptive Graph Convolutional Networks for Skeleton-Based Action Recognition. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01230"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Lei Shi Yifan Zhang Jian Cheng and Hanqing Lu. 2020. Decoupled Spatial-Temporal Attention Network for Skeleton-Based Action Recognition. In ACCV.","DOI":"10.1007\/978-3-030-69541-5_3"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3142771"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2818328"},{"key":"e_1_3_2_2_46_1","unstructured":"Hugo Touvron Matthieu Cord Matthijs Douze Francisco Massa Alexandre Sablayrolles and Herv\u00e9 J\u00e9gou. 2021. Training data-efficient image transformers & distillation through attention. arxiv:2012.12877\u00a0[cs.CV]"},{"key":"e_1_3_2_2_47_1","volume-title":"arXiv preprint arXiv:1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. arXiv preprint arXiv:1706.03762 (2017)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.82"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.387"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.3390\/s23125593"},{"key":"e_1_3_2_2_51_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097)","author":"Wu Felix","year":"2019","unstructured":"Felix Wu, Amauri Souza, Tianyi Zhang, Christopher Fifty, Tao Yu, and Kilian Weinberger. 2019. Simplifying Graph Convolutional Networks. In Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, 6861\u20136871. https:\/\/proceedings.mlr.press\/v97\/wu19e.html"},{"key":"e_1_3_2_2_52_1","volume-title":"CvT: Introducing Convolutions to Vision Transformers. CoRR abs\/2103.15808","author":"Wu Haiping","year":"2021","unstructured":"Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, and Lei Zhang. 2021. CvT: Introducing Convolutions to Vision Transformers. CoRR abs\/2103.15808 (2021). arXiv:2103.15808https:\/\/arxiv.org\/abs\/2103.15808"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"crossref","unstructured":"Sijie Yan Yuanjun Xiong and Dahua Lin. 2018. Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition. In AAAI Vol.\u00a032.","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CAC57257.2022.10055641"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"crossref","unstructured":"Fanfan Ye Shiliang Pu Qiaoyong Zhong Chao Li Di Xie and Huiming Tang. 2020. Dynamic GCN: Context-enriched Topology Learning for Skeleton-based Action Recognition. In ACM Multimedia. 55\u201363.","DOI":"10.1145\/3394171.3413941"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2021.3072345"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"crossref","unstructured":"Pengfei Zhang Cuiling Lan Wenjun Zeng Junliang Xing Jianru Xue and Nanning Zheng. 2020. Semantics-Guided Neural Networks for Efficient Skeleton-Based Human Action Recognition. arxiv:1904.01189\u00a0[cs.CV]","DOI":"10.1109\/CVPR42600.2020.00119"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01434"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475473"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2012.24"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"crossref","unstructured":"Sixiao Zheng Jiachen Lu Hengshuang Zhao Xiatian Zhu Zekun Luo Yabiao Wang Yanwei Fu Jianfeng Feng Tao Xiang Philip H.\u00a0S. Torr and Li Zhang. 2021. Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers. arxiv:2012.15840\u00a0[cs.CV]","DOI":"10.1109\/CVPR46437.2021.00681"}],"event":{"name":"ICVGIP '23: Indian Conference on Computer Vision, Graphics and Image Processing","acronym":"ICVGIP '23","location":"Rupnagar India"},"container-title":["Proceedings of the Fourteenth Indian Conference on Computer Vision, Graphics and Image Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627640","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627631.3627640","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:49:27Z","timestamp":1755892167000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627640"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,15]]},"references-count":61,"alternative-id":["10.1145\/3627631.3627640","10.1145\/3627631"],"URL":"https:\/\/doi.org\/10.1145\/3627631.3627640","relation":{},"subject":[],"published":{"date-parts":[[2023,12,15]]},"assertion":[{"value":"2024-01-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}