{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T18:40:06Z","timestamp":1748198406540,"version":"3.41.0"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031915741"},{"type":"electronic","value":"9783031915758"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91575-8_2","type":"book-chapter","created":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T17:57:49Z","timestamp":1748195869000},"page":"19-35","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["HybridFormer: Bridging Local and\u00a0Global Spatio-Temporal Dynamics for\u00a0Efficient Skeleton-Based Action Recognition"],"prefix":"10.1007","author":[{"given":"Zeyun","family":"Zhong","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianrui","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Manuel","family":"Martin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mickael","family":"Cormier","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengzhi","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Frederik","family":"Diederichs","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Juergen","family":"Beyerer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"2_CR1","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"2_CR2","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"2_CR3","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, vol.\u00a033, pp. 1877\u20131901 (2020)"},{"key":"2_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Y., Zhang, Z., Yuan, C., Li, B., Deng, Y., Hu, W.: Channel-wise topology refinement graph convolution for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13359\u201313368 (2021)","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, K., Zhang, Y., Cao, C., Shi, L., Cheng, J., Lu, H.: Decoupling GCN with dropgraph module for skeleton-based action recognition. In: European Conference on Computer Vision, pp. 536\u2013553 (2020)","DOI":"10.1007\/978-3-030-58586-0_32"},{"key":"2_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, K., Zhang, Y., He, X., Chen, W., Cheng, J., Lu, H.: Skeleton-based action recognition with shift graph convolutional network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 183\u2013192 (2020)","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"2_CR7","doi-asserted-by":"crossref","unstructured":"Chi, H.G., Ha, M.H., Chi, S., Lee, S.W., Huang, Q., Ramani, K.: InfoGCN: representation learning for human skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20186\u201320196 (2022)","DOI":"10.1109\/CVPR52688.2022.01955"},{"key":"2_CR8","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jiang, C., Wang, L., Wu, G.: Mixformer: end-to-end tracking with iterative mixed attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13608\u201313618 (2022)","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"2_CR9","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.N.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186 (2018)"},{"key":"2_CR10","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR 2021: The Ninth International Conference on Learning Representations (2021)"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Du, Y., Wang, W., Wang, L.: Hierarchical recurrent neural network for skeleton based action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1110\u20131118 (2015)","DOI":"10.1109\/CVPR.2015.7298714"},{"key":"2_CR12","doi-asserted-by":"crossref","unstructured":"Duan, H., Wang, J., Chen, K., Lin, D.: PYSKL: towards good practices for skeleton action recognition. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 7351\u20137354 (2022)","DOI":"10.1145\/3503161.3548546"},{"key":"2_CR13","doi-asserted-by":"publisher","unstructured":"Gong, Y., Chung, Y.A., Glass, J.: AST: audio spectrogram transformer. In: Proceedings of the Interspeech 2021, pp. 571\u2013575 (2021). https:\/\/doi.org\/10.21437\/Interspeech.2021-698","DOI":"10.21437\/Interspeech.2021-698"},{"key":"2_CR14","doi-asserted-by":"crossref","unstructured":"Ke, Q., Bennamoun, M., An, S., Sohel, F., Boussaid, F.: A new representation of skeleton sequences for 3d action recognition. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4570\u20134579 (2017)","DOI":"10.1109\/CVPR.2017.486"},{"key":"2_CR15","unstructured":"Kipf, T.N., Welling, M.: Semi-supervised classification with graph convolutional networks. In: ICLR (Poster) (2016)"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Lee, J., Lee, M., Lee, D., Lee, S.: Hierarchically decomposed graph convolutional networks for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10444\u201310453 (2023)","DOI":"10.1109\/ICCV51070.2023.00958"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"Liu, H., Liu, Y., Chen, Y., Yuan, C., Li, B., Hu, W.: Transkeleton: hierarchical spatial-temporal transformer for skeleton-based action recognition. IEEE Trans. Circ. Syst. Video Technol. (2023)","DOI":"10.1109\/TCSVT.2023.3240472"},{"issue":"10","key":"2_CR18","doi-asserted-by":"publisher","first-page":"2684","DOI":"10.1109\/TPAMI.2019.2916873","volume":"42","author":"J Liu","year":"2019","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., Duan, L.Y., Kot, A.C.: NTU RGB+D 120: a large-scale benchmark for 3d human activity understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42(10), 2684\u20132701 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"68","key":"2_CR19","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1016\/j.patcog.2017.02.030","volume":"68","author":"M Liu","year":"2017","unstructured":"Liu, M., Liu, H., Chen, C.: Enhanced skeleton visualization for view invariant human action recognition. Pattern Recogn. 68(68), 346\u2013362 (2017)","journal-title":"Pattern Recogn."},{"key":"2_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Z., et\u00a0al.: Swin transformer v2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Z., Zhang, H., Chen, Z., Wang, Z., Ouyang, W.: Disentangling and unifying graph convolutions for skeleton-based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 143\u2013152 (2020)","DOI":"10.1109\/CVPR42600.2020.00022"},{"key":"2_CR22","doi-asserted-by":"crossref","unstructured":"Plizzari, C., Cannici, M., Matteucci, M.: Spatial temporal transformer network for skeleton-based action recognition. In: International Conference on Pattern Recognition Workshops and Challenges, pp. 694\u2013701. Springer (2021)","DOI":"10.1007\/978-3-030-68796-0_50"},{"key":"2_CR23","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: NTU RGB+ D: a large scale dataset for 3D human activity analysis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1010\u20131019 (2016)","DOI":"10.1109\/CVPR.2016.115"},{"key":"2_CR25","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H.: Two-stream adaptive graph convolutional networks for skeleton-based action recognition. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12026\u201312035 (2019)","DOI":"10.1109\/CVPR.2019.01230"},{"key":"2_CR26","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H.: Decoupled spatial-temporal attention network for skeleton-based action-gesture recognition. In: Proceedings of the Asian Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-69541-5_3"},{"key":"2_CR27","doi-asserted-by":"crossref","unstructured":"Song, S., Lan, C., Xing, J., Zeng, W., Liu, J.: An end-to-end spatio-temporal attention model for human action recognition from skeleton data. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a031 (2017)","DOI":"10.1609\/aaai.v31i1.11212"},{"key":"2_CR28","doi-asserted-by":"crossref","unstructured":"Song, Y., Zhang, Z., Shan, C., Wang, L.: Constructing stronger and faster baselines for skeleton-based action recognition. IEEE Trans. Pattern Anal. Mach. Intell. PP (2022)","DOI":"10.1109\/TPAMI.2022.3157033"},{"key":"2_CR29","unstructured":"Tolstikhin, I., et al.: MLP-mixer: an all-MLP architecture for vision. arXiv preprint arXiv:2105.01601 (2021)"},{"key":"2_CR30","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural. Inf. Process. Syst. 35, 10078\u201310093 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2_CR31","unstructured":"Touvron, H., Cord, M., Matthijs, D., Massa, F., Sablayrolles, A., Jegou, H.: Training data-efficient image transformers & distillation through attention. In: ICML 2021: 38th International Conference on Machine Learning (2021)"},{"key":"2_CR32","unstructured":"Touvron, H., et\u00a0al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"2_CR33","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"2_CR34","doi-asserted-by":"crossref","unstructured":"Wang, J., Nie, X., Xia, Y., Wu, Y., Zhu, S.C.: Cross-view action modeling, learning and recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2649\u20132656 (2014)","DOI":"10.1109\/CVPR.2014.339"},{"key":"2_CR35","doi-asserted-by":"crossref","unstructured":"Xin, W., Miao, Q., Liu, Y., Liu, R., Pun, C.M., Shi, C.: Skeleton mixformer: multivariate topology representation for skeleton-based action recognition. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 2211\u20132220 (2023)","DOI":"10.1145\/3581783.3611900"},{"key":"2_CR36","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: AAAI, pp. 7444\u20137452 (2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"2_CR37","doi-asserted-by":"crossref","unstructured":"Ye, F., Pu, S., Zhong, Q., Li, C., Xie, D., Tang, H.: Dynamic GCN: context-enriched topology learning for skeleton-based action recognition. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 55\u201363 (2020)","DOI":"10.1145\/3394171.3413941"},{"key":"2_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, P., Lan, C., Xing, J., Zeng, W., Xue, J., Zheng, N.: View adaptive recurrent neural networks for high performance human action recognition from skeleton data. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2117\u20132126 (2017)","DOI":"10.1109\/ICCV.2017.233"},{"key":"2_CR39","doi-asserted-by":"crossref","unstructured":"Zhang, P., Lan, C., Zeng, W., Xing, J., Xue, J., Zheng, N.: Semantics-guided neural networks for efficient skeleton-based human action recognition. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1112\u20131121 (2020)","DOI":"10.1109\/CVPR42600.2020.00119"},{"key":"2_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Wu, B., Li, W., Duan, L., Gan, C.: STST: spatial-temporal specialized transformer for skeleton-based action recognition. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3229\u20133237 (2021)","DOI":"10.1145\/3474085.3475473"},{"key":"2_CR41","doi-asserted-by":"crossref","unstructured":"Zhou, H., Liu, Q., Wang, Y.: Learning discriminative representations for skeleton based action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10608\u201310617 (2023)","DOI":"10.1109\/CVPR52729.2023.01022"},{"key":"2_CR42","unstructured":"Zhou, Y., et al.: Hypergraph transformer for skeleton-based action recognition. arXiv preprint arXiv:2211.09590 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91575-8_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,25]],"date-time":"2025-05-25T17:58:03Z","timestamp":1748195883000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91575-8_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031915741","9783031915758"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91575-8_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}