{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,11]],"date-time":"2025-03-11T04:06:56Z","timestamp":1741666016134,"version":"3.38.0"},"reference-count":72,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T00:00:00Z","timestamp":1724976000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T00:00:00Z","timestamp":1724976000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No.2022YFB3305803","No.2022YFB3305803"],"award-info":[{"award-number":["No.2022YFB3305803","No.2022YFB3305803"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1007\/s00371-024-03601-1","type":"journal-article","created":{"date-parts":[[2024,8,30]],"date-time":"2024-08-30T07:02:42Z","timestamp":1725001362000},"page":"3263-3281","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["TP-LSM: visual temporal pyramidal time modeling network to multi-label action detection in image-based AI"],"prefix":"10.1007","volume":"41","author":[{"given":"Haojie","family":"Gao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peishun","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaolong","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zikang","family":"Yan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ningning","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenqiang","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuefang","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruichun","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,30]]},"reference":[{"issue":"2","key":"3601_CR1","doi-asserted-by":"publisher","first-page":"924","DOI":"10.1109\/TIP.2018.2872628","volume":"28","author":"X Nie","year":"2019","unstructured":"Nie, X., Feng, J., Xing, J., Xiao, S., Yan, S.: Hierarchical contextual refinement networks for human pose estimation. IEEE Trans. Image Process. 28(2), 924\u2013936 (2019). https:\/\/doi.org\/10.1109\/TIP.2018.2872628","journal-title":"IEEE Trans. Image Process."},{"issue":"3\u20134","key":"3601_CR2","doi-asserted-by":"publisher","first-page":"2078","DOI":"10.1002\/cav.2078","volume":"33","author":"Y Wu","year":"2022","unstructured":"Wu, Y., Wang, C.: Parallel-branch network for 3D human pose and shape estimation in video. Comput. Animat. Virtual Worlds 33(3\u20134), 2078 (2022). https:\/\/doi.org\/10.1002\/cav.2078","journal-title":"Comput. Animat. Virtual Worlds"},{"issue":"3\u20134","key":"3601_CR3","doi-asserted-by":"publisher","first-page":"2187","DOI":"10.1002\/cav.2187","volume":"34","author":"L Sun","year":"2023","unstructured":"Sun, L., Tang, T., Qu, Y., Qin, W.: Bidirectional temporal feature for 3D human pose and shape estimation from a video. Comput. Animat. Virtual Worlds 34(3\u20134), 2187 (2023). https:\/\/doi.org\/10.1002\/cav.2187","journal-title":"Comput. Animat. Virtual Worlds"},{"issue":"4\u20135","key":"3601_CR4","doi-asserted-by":"publisher","first-page":"1958","DOI":"10.1002\/cav.1958","volume":"31","author":"Y Li","year":"2020","unstructured":"Li, Y., Qiu, L., Wang, L., Liu, F., Wang, Z., Iulian Poiana, S., Yang, X., Zhang, J.: Densely connected GCN model for motion prediction. Comput. Animat. Virtual Worlds 31(4\u20135), 1958 (2020). https:\/\/doi.org\/10.1002\/cav.1958","journal-title":"Comput. Animat. Virtual Worlds"},{"issue":"9","key":"3601_CR5","doi-asserted-by":"publisher","first-page":"4800","DOI":"10.1109\/TNNLS.2021.3061115","volume":"33","author":"C Li","year":"2022","unstructured":"Li, C., Xie, C., Zhang, B., Han, J., Zhen, X., Chen, J.: Memory attention networks for skeleton-based action recognition. IEEE Trans. Neural Netw. Learn. Syst. 33(9), 4800\u20134814 (2022). https:\/\/doi.org\/10.1109\/TNNLS.2021.3061115","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"issue":"3\u20134","key":"3601_CR6","doi-asserted-by":"publisher","first-page":"2070","DOI":"10.1002\/cav.2070","volume":"33","author":"Q Xu","year":"2022","unstructured":"Xu, Q., Liu, F., Fu, Z., Zhou, A., Qi, J.: AeS-GCN: attention-enhanced semantic-guided graph convolutional networks for skeleton-based action recognition. Comput. Animat. Virtual Worlds 33(3\u20134), 2070 (2022). https:\/\/doi.org\/10.1002\/cav.2070","journal-title":"Comput. Animat. Virtual Worlds"},{"issue":"5","key":"3601_CR7","doi-asserted-by":"publisher","first-page":"2575","DOI":"10.1109\/TVCG.2023.3247075","volume":"29","author":"Y Liu","year":"2023","unstructured":"Liu, Y., Zhang, H., Li, Y., He, K., Xu, D.: Skeleton-based human action recognition via large-kernel attention graph convolutional network. IEEE Trans. Vis. Comput. Graph. 29(5), 2575\u20132585 (2023). https:\/\/doi.org\/10.1109\/TVCG.2023.3247075","journal-title":"IEEE Trans. Vis. Comput. Graph."},{"key":"3601_CR8","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-031-50075-6_9","volume-title":"Advances in Computer Graphics","author":"W Zhao","year":"2024","unstructured":"Zhao, W., Peng, J., Lv, N.: MS-GTR: multi-stream graph transformer for skeleton-based action recognition. In: Sheng, B., Bi, L., Kim, J., Magnenat-Thalmann, N., Thalmann, D. (eds.) Advances in Computer Graphics, pp. 104\u2013118. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-50075-6_9"},{"issue":"4","key":"3601_CR9","doi-asserted-by":"publisher","first-page":"1126","DOI":"10.1109\/TIP.2010.2076821","volume":"20","author":"A Oikonomopoulos","year":"2011","unstructured":"Oikonomopoulos, A., Patras, I., Pantic, M.: Spatiotemporal localization and categorization of human actions in unsegmented image sequences. IEEE Trans. Image Process. 20(4), 1126\u20131140 (2011). https:\/\/doi.org\/10.1109\/TIP.2010.2076821","journal-title":"IEEE Trans. Image Process."},{"key":"3601_CR10","doi-asserted-by":"publisher","first-page":"5154","DOI":"10.1109\/TIP.2021.3078324","volume":"30","author":"L Huang","year":"2021","unstructured":"Huang, L., Huang, Y., Ouyang, W., Wang, L.: Modeling sub-actions for weakly supervised temporal action localization. IEEE Trans. Image Process. 30, 5154\u20135167 (2021). https:\/\/doi.org\/10.1109\/TIP.2021.3078324","journal-title":"IEEE Trans. Image Process."},{"issue":"4","key":"3601_CR11","doi-asserted-by":"publisher","first-page":"1852","DOI":"10.1109\/TNNLS.2019.2962815","volume":"34","author":"X-Y Zhang","year":"2023","unstructured":"Zhang, X.-Y., Li, C., Shi, H., Zhu, X., Li, P., Dong, J.: AdapNet: adaptability decomposing encoder-decoder network for weakly supervised action recognition and localization. IEEE Trans. Neural Netw. Learn. Syst. 34(4), 1852\u20131863 (2023). https:\/\/doi.org\/10.1109\/TNNLS.2019.2962815","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"issue":"5","key":"3601_CR12","doi-asserted-by":"publisher","first-page":"2107","DOI":"10.1002\/cav.2107","volume":"33","author":"M Almushyti","year":"2022","unstructured":"Almushyti, M., Li, F.W.B.: Distillation of human-object interaction contexts for action recognition. Comput. Animat. Virtual Worlds 33(5), 2107 (2022). https:\/\/doi.org\/10.1002\/cav.2107","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"3601_CR13","doi-asserted-by":"crossref","unstructured":"Dai, R., Das, S., Kahatapitiya, K., Ryoo, M.S., Br\u00e9mond, F.: MS-TCT: Multi-scale Temporal ConvTransformer for Action Detection, pp. 20041\u201320051 (2022). https:\/\/openaccess.thecvf.com\/content\/CVPR2022\/html\/Dai_MS-TCT_Multi-Scale_Temporal_ConvTransformer_for_Action_Detection_CVPR_2022_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR52688.2022.01941"},{"key":"3601_CR14","doi-asserted-by":"crossref","unstructured":"Sardari, F., Mustafa, A., Jackson, P.J.B., Hilton, A.: PAT: Position-Aware Transformer for Dense Multi-Label Action Detection, pp. 2988\u20132997 (2023). https:\/\/openaccess.thecvf.com\/content\/ICCV2023W\/CVEU\/html\/Sardari_PAT_Position-Aware_Transformer_for_Dense_Multi-Label_Action_Detection_ICCVW_2023_paper.html Accessed 28 October 2023","DOI":"10.1109\/ICCVW60793.2023.00321"},{"key":"3601_CR15","doi-asserted-by":"publisher","first-page":"28","DOI":"10.1007\/978-3-031-50069-5_4","volume-title":"Advances in Computer Graphics","author":"B Deng","year":"2024","unstructured":"Deng, B., Zhao, S., Liu, D.: TadML: A Fast Temporal Action Detection with Mechanics-MLP. In: Sheng, B., Bi, L., Kim, J., Magnenat-Thalmann, N., Thalmann, D. (eds.) Advances in Computer Graphics, pp. 28\u201340. Springer, Cham (2024). https:\/\/doi.org\/10.1007\/978-3-031-50069-5_4"},{"issue":"2","key":"3601_CR16","doi-asserted-by":"publisher","first-page":"375","DOI":"10.1007\/s11263-017-1013-y","volume":"126","author":"S Yeung","year":"2018","unstructured":"Yeung, S., Russakovsky, O., Jin, N., Andriluka, M., Mori, G., Fei-Fei, L.: Every moment counts: dense detailed labeling of actions in complex videos. Int. J. Comput. Vision 126(2), 375\u2013389 (2018). https:\/\/doi.org\/10.1007\/s11263-017-1013-y","journal-title":"Int. J. Comput. Vision"},{"key":"3601_CR17","doi-asserted-by":"crossref","unstructured":"Lea, C., Flynn, M.D., Vidal, R., Reiter, A., Hager, G.D.: Temporal Convolutional Networks for Action Segmentation and Detection, pp. 156\u2013165 (2017). https:\/\/openaccess.thecvf.com\/content_cvpr_2017\/html\/Lea_Temporal_Convolutional_Networks_CVPR_2017_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR.2017.113"},{"key":"3601_CR18","unstructured":"Piergiovanni, A., Ryoo, M.: Temporal gaussian mixture layer for videos. In: Proceedings of the 36th International Conference on Machine Learning, pp. 5152\u20135161. PMLR (2019). https:\/\/proceedings.mlr.press\/v97\/piergiovanni19a.html. Accessed 28 October 2023"},{"key":"3601_CR19","doi-asserted-by":"crossref","unstructured":"Dai, R., Das, S., Minciullo, L., Garattoni, L., Francesca, G., Bremond, F.: PDAN: Pyramid Dilated Attention Network for Action Detection, pp. 2970\u20132979 (2021). https:\/\/openaccess.thecvf.com\/content\/WACV2021\/html\/Dai_PDAN_Pyramid_Dilated_Attention_Network_for_Action_Detection_WACV_2021_paper.html Accessed 28 October 2023","DOI":"10.1109\/WACV48630.2021.00301"},{"key":"3601_CR20","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space\u2013time attention all you need for video understanding? In: Proceedings of the 38th International Conference on Machine Learning, pp. 813\u2013824. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/bertasius21a.html Accessed 28 October 2023"},{"key":"3601_CR21","doi-asserted-by":"crossref","unstructured":"Tan, J., Tang, J., Wang, L., Wu, G.: Relaxed Transformer Decoders for Direct Action Proposal Generation, pp. 13526\u201313535 (2021). https:\/\/openaccess.thecvf.com\/content\/ICCV2021\/html\/Tan_Relaxed_Transformer_Decoders_for_Direct_Action_Proposal_Generation_ICCV_2021_paper.html?ref=https:\/\/githubhelp.com Accessed 28 October 2023","DOI":"10.1109\/ICCV48922.2021.01327"},{"key":"3601_CR22","doi-asserted-by":"publisher","first-page":"5427","DOI":"10.1109\/TIP.2022.3195321","volume":"31","author":"X Liu","year":"2022","unstructured":"Liu, X., Wang, Q., Hu, Y., Tang, X., Zhang, S., Bai, S., Bai, X.: End-to-end temporal action detection with transformer. IEEE Trans. Image Process. 31, 5427\u20135441 (2022). https:\/\/doi.org\/10.1109\/TIP.2022.3195321","journal-title":"IEEE Trans. Image Process."},{"key":"3601_CR23","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Proceedings of Advances in Neural Information Processing Systems, vol. 27. Curran Associates Inc. (2014). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2014\/hash\/00ec53c4682d36f5c4359f4ae7bd7ba1-Abstract.html Accessed 14 February 2024"},{"issue":"4\u20135","key":"3601_CR24","doi-asserted-by":"publisher","first-page":"1952","DOI":"10.1002\/cav.1952","volume":"31","author":"Z Chen","year":"2020","unstructured":"Chen, Z., Pan, J., Yang, X., Qin, H.: Hybrid features for skeleton-based action recognition based on network fusion. Comput. Animat. Virtual Worlds 31(4\u20135), 1952 (2020). https:\/\/doi.org\/10.1002\/cav.1952","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"3601_CR25","doi-asserted-by":"crossref","unstructured":"Kahatapitiya, K., Ryoo, M.S.: Coarse-Fine Networks for Temporal Activity Detection in Videos, pp. 8385\u20138394 (2021). https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Kahatapitiya_Coarse-Fine_Networks_for_Temporal_Activity_Detection_in_Videos_CVPR_2021_paper.html Accessed 28 October 2023","DOI":"10.1109\/CVPR46437.2021.00828"},{"issue":"6","key":"3601_CR26","doi-asserted-by":"publisher","first-page":"2193","DOI":"10.1002\/cav.2193","volume":"34","author":"J Lu","year":"2023","unstructured":"Lu, J., Gong, Y., Zhou, Y., Ma, C., Huang, T.: CHAN: skeleton based action recognition by multi-level feature learning. Comput. Animat. Virtual Worlds 34(6), 2193 (2023). https:\/\/doi.org\/10.1002\/cav.2193","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"3601_CR27","doi-asserted-by":"publisher","first-page":"2221","DOI":"10.1002\/cav.2221","volume":"35","author":"X Li","year":"2024","unstructured":"Li, X., Lu, J., Zhou, J., Liu, W., Zhang, K.: Multi-temporal scale aggregation refinement graph convolutional network for skeleton-based action recognition. Comput. Animat. Virtual Worlds 35, 2221 (2024). https:\/\/doi.org\/10.1002\/cav.2221","journal-title":"Comput. Animat. Virtual Worlds"},{"key":"3601_CR28","doi-asserted-by":"crossref","unstructured":"Shi, D., Zhong, Y., Cao, Q., Ma, L., Li, J., Tao, D.: TriDet: Temporal Action Detection with Relative Boundary Modeling, pp. 18857\u201318866 (2023). https:\/\/openaccess.thecvf.com\/content\/CVPR2023\/html\/Shi_TriDet_Temporal_Action_Detection_With_Relative_Boundary_Modeling_CVPR_2023_paper.html. Accessed 15 February 2024","DOI":"10.1109\/CVPR52729.2023.01808"},{"key":"3601_CR29","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3D: Expanding Architectures for Efficient Video Recognition, pp. 203\u2013213 (2020). https:\/\/openaccess.thecvf.com\/content_CVPR_2020\/html\/Feichtenhofer_X3D_Expanding_Architectures_for_Efficient_Video_Recognition_CVPR_2020_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"3601_CR30","unstructured":"Dai, R., Das, S., Br\u00e9mond, F.: CTRN: class-temporal relational network for action detection. In: Proceedings of 32nd British Machine Vision Conference 2021, BMVC 2021, Online, November 22\u201325, 2021, p. 224. BMVA Press (2021). https:\/\/www.bmvc2021-virtualconference.com\/assets\/papers\/0133.pdf. Accessed 28 October 2023"},{"key":"3601_CR31","doi-asserted-by":"publisher","unstructured":"Dai, X., Singh, B., Ng, J.Y.-H., Davis, L.: TAN: Temporal aggregation network for dense multi-label action recognition. In: Proceedings of 2019 IEEE Winter Conference on Applications of Computer Vision (WACV), pp. 151\u2013160 (2019). https:\/\/doi.org\/10.1109\/WACV.2019.00022. https:\/\/ieeexplore.ieee.org\/abstract\/document\/8658927. Accessed 28 October 2023","DOI":"10.1109\/WACV.2019.00022"},{"key":"3601_CR32","doi-asserted-by":"crossref","unstructured":"Tirupattur, P., Duarte, K., Rawat, Y.S., Shah, M.: Modeling Multi-Label Action Dependencies for Temporal Action Localization, pp. 1460\u20131470 (2021). https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Tirupattur_Modeling_Multi-Label_Action_Dependencies_for_Temporal_Action_Localization_CVPR_2021_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR46437.2021.00151"},{"issue":"3","key":"3601_CR33","doi-asserted-by":"publisher","first-page":"733","DOI":"10.1162\/coli_a_00445","volume":"48","author":"P Dufter","year":"2022","unstructured":"Dufter, P., Schmitt, M., Sch\u00fctze, H.: Position information in transformers: an overview. Comput. Linguist. 48(3), 733\u2013763 (2022). https:\/\/doi.org\/10.1162\/coli_a_00445","journal-title":"Comput. Linguist."},{"key":"3601_CR34","unstructured":"Li, Y., Si, S., Li, G., Hsieh, C.-J., Bengio, S.: Learnable fourier features for multi-dimensional spatial positional encoding. In: Advances in Neural Information Processing Systems, vol. 34, pp. 15816\u201315829. Curran Associates Inc. (2021). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2021\/hash\/84c2d4860a0fc27bcf854c444fb8b400-Abstract.html. Accessed 28 October 2023"},{"key":"3601_CR35","doi-asserted-by":"publisher","unstructured":"Shaw, P., Uszkoreit, J., Vaswani, A.: Self-attention with relative position representations. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers), pp. 464\u2013468. Association for Computational Linguistics, New Orleans (2018). https:\/\/doi.org\/10.18653\/v1\/N18-2074. https:\/\/aclanthology.org\/N18-2074. Accessed 28 October 2023","DOI":"10.18653\/v1\/N18-2074"},{"key":"3601_CR36","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Dollar, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature Pyramid Networks for Object Detection, pp. 2117\u20132125 (2017). https:\/\/openaccess.thecvf.com\/content_cvpr_2017\/html\/Lin_Feature_Pyramid_Networks_CVPR_2017_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR.2017.106"},{"key":"3601_CR37","doi-asserted-by":"crossref","unstructured":"Lin, T., Liu, X., Li, X., Ding, E., Wen, S.: BMN: Boundary-Matching Network for Temporal Action Proposal Generation, pp. 3889\u20133898 (2019). https:\/\/openaccess.thecvf.com\/content_ICCV_2019\/html\/Lin_BMN_Boundary-Matching_Network_for_Temporal_Action_Proposal_Generation_ICCV_2019_paper.html. Accessed 28 October 2023","DOI":"10.1109\/ICCV.2019.00399"},{"key":"3601_CR38","doi-asserted-by":"crossref","unstructured":"Chao, Y.-W., Vijayanarasimhan, S., Seybold, B., Ross, D.A., Deng, J., Sukthankar, R.: Rethinking the Faster R-CNN Architecture for Temporal Action Localization, pp. 1130\u20131139 (2018). https:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Chao_Rethinking_the_Faster_CVPR_2018_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR.2018.00124"},{"key":"3601_CR39","doi-asserted-by":"crossref","unstructured":"Lin, C., Xu, C., Luo, D., Wang, Y., Tai, Y., Wang, C., Li, J., Huang, F., Fu, Y.: Learning Salient Boundary Feature for Anchor-Free Temporal Action Localization, pp. 3320\u20133329 (2021). https:\/\/openaccess.thecvf.com\/content\/CVPR2021\/html\/Lin_Learning_Salient_Boundary_Feature_for_Anchor-free_Temporal_Action_Localization_CVPR_2021_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"3601_CR40","doi-asserted-by":"publisher","unstructured":"Ma, F., Zhu, L., Yang, Y., Zha, S., Kundu, G., Feiszli, M., Shou, Z.: SF-Net: single-frame supervision for temporal action localization. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision\u2014ECCV 2020. Lecture Notes in Computer Science, pp. 420\u2013437. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_25","DOI":"10.1007\/978-3-030-58548-8_25"},{"key":"3601_CR41","doi-asserted-by":"publisher","unstructured":"Zhang, C.-L., Wu, J., Li, Y.: ActionFormer: localizing moments of actions with transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision\u2014 ECCV 2022. Lecture Notes in Computer Science, pp. 492\u2013510. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19772-7_29","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"3601_CR42","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhao, C., Rojas, D.S., Thabet, A., Ghanem, B.: G-TAD: Sub-Graph Localization for Temporal Action Detection, pp. 10156\u201310165 (2020). https:\/\/openaccess.thecvf.com\/content_CVPR_2020\/html\/Xu_G-TAD_Sub-Graph_Localization_for_Temporal_Action_Detection_CVPR_2020_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"3601_CR43","doi-asserted-by":"publisher","unstructured":"Chang, S., Wang, P., Wang, F., Li, H., Shou, Z.: Augmented transformer with adaptive graph for temporal action proposal generation. In: Proceedings of the 3rd International Workshop on Human-Centric Multimedia Analysis. HCMA \u201922, pp. 41\u201350. Association for Computing Machinery, New York (2022).https:\/\/doi.org\/10.1145\/3552458.3556443 . https:\/\/dl.acm.org\/doi\/10.1145\/3552458.3556443 Accessed 27 October 2023","DOI":"10.1145\/3552458.3556443"},{"key":"3601_CR44","doi-asserted-by":"crossref","unstructured":"Zhao, H., Torralba, A., Torresani, L., Yan, Z.: HACS: Human Action Clips and Segments Dataset for Recognition and Temporal Localization, pp. 8668\u20138678 (2019). https:\/\/openaccess.thecvf.com\/content_ICCV_2019\/html\/Zhao_HACS_Human_Action_Clips_and_Segments_Dataset_for_Recognition_and_ICCV_2019_paper.html Accessed 28 October 2023","DOI":"10.1109\/ICCV.2019.00876"},{"key":"3601_CR45","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Su, H., Wang, C., Yang, M.: BSN: Boundary Sensitive Network for Temporal Action Proposal Generation, pp. 3\u201319 (2018). https:\/\/openaccess.thecvf.com\/content_ECCV_2018\/html\/Tianwei_Lin_BSN_Boundary_Sensitive_ECCV_2018_paper.html. Accessed 28 October 2023","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"3601_CR46","doi-asserted-by":"crossref","unstructured":"Liu, Y., Ma, L., Zhang, Y., Liu, W., Chang, S.-F.: Multi-Granularity Generator for Temporal Action Proposal, pp. 3604\u20133613 (2019). https:\/\/openaccess.thecvf.com\/content_CVPR_2019\/html\/Liu_Multi-Granularity_Generator_for_Temporal_Action_Proposal_CVPR_2019_paper.html Accessed 28 October 2023","DOI":"10.1109\/CVPR.2019.00372"},{"key":"3601_CR47","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-C3D: Region Convolutional 3D Network for Temporal Activity Detection, pp. 5783\u20135792 (2017). https:\/\/openaccess.thecvf.com\/content_iccv_2017\/html\/Xu_R-C3D_Region_Convolutional_ICCV_2017_paper.html. Accessed 28 October 2023","DOI":"10.1109\/ICCV.2017.617"},{"issue":"2","key":"3601_CR48","doi-asserted-by":"publisher","first-page":"337","DOI":"10.1109\/TMM.2019.2929923","volume":"22","author":"H Liu","year":"2020","unstructured":"Liu, H., Wang, S., Wang, W., Cheng, J.: Multi-scale based context-aware net for action detection. IEEE Trans. Multimed. 22(2), 337\u2013348 (2020). https:\/\/doi.org\/10.1109\/TMM.2019.2929923","journal-title":"IEEE Trans. Multimed."},{"key":"3601_CR49","doi-asserted-by":"publisher","first-page":"2672","DOI":"10.1109\/TMM.2020.3014555","volume":"23","author":"G Chen","year":"2021","unstructured":"Chen, G., Zhang, C., Zou, Y.: AFNet: temporal locality-aware network with dual structure for accurate and fast action detection. IEEE Trans. Multimed. 23, 2672\u20132682 (2021). https:\/\/doi.org\/10.1109\/TMM.2020.3014555","journal-title":"IEEE Trans. Multimed."},{"issue":"11","key":"3601_CR50","doi-asserted-by":"publisher","first-page":"5469","DOI":"10.1007\/s00371-022-02673-1","volume":"39","author":"X Yao","year":"2023","unstructured":"Yao, X., Zhang, J., Chen, R., Zhang, D., Zeng, Y.: Weakly supervised graph learning for action recognition in untrimmed video. Vis. Comput. 39(11), 5469\u20135483 (2023). https:\/\/doi.org\/10.1007\/s00371-022-02673-1","journal-title":"Vis. Comput."},{"key":"3601_CR51","doi-asserted-by":"crossref","unstructured":"Piergiovanni, A.J., Ryoo, M.S.: Learning Latent Super-Events to Detect Multiple Activities in Videos, pp. 5304\u20135313 (2018). https:\/\/openaccess.thecvf.com\/content_cvpr_2018\/html\/Piergiovanni_Learning_Latent_Super-Events_CVPR_2018_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR.2018.00556"},{"key":"3601_CR52","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast Networks for Video Recognition, pp. 6202\u20136211 (2019). https:\/\/openaccess.thecvf.com\/content_ICCV_2019\/html\/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html. Accessed 28 October 2023","DOI":"10.1109\/ICCV.2019.00630"},{"key":"3601_CR53","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, ., Polosukhin, I.: Attention is All You Need. In: Advances in Neural Information Processing Systems, vol. 30. Curran Associates Inc. (2017). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html. Accessed 28 October 2023"},{"key":"3601_CR54","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423 . https:\/\/aclanthology.org\/N19-1423. Accessed 28 October 2023","DOI":"10.18653\/v1\/N19-1423"},{"key":"3601_CR55","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. (2020). https:\/\/openreview.net\/forum?id=YicbFdNTTy. Accessed 28 October 2023"},{"key":"3601_CR56","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin Transformer: Hierarchical Vision Transformer Using Shifted Windows, pp. 10012\u201310022 (2021). https:\/\/openaccess.thecvf.com\/content\/ICCV2021\/html\/Liu_Swin_Transformer_Hierarchical_Vision_Transformer_Using_Shifted_Windows_ICCV_2021_paper. Accessed 28 October 2023","DOI":"10.1109\/ICCV48922.2021.00986"},{"issue":"3","key":"3601_CR57","doi-asserted-by":"publisher","first-page":"415","DOI":"10.1007\/s41095-022-0274-8","volume":"8","author":"W Wang","year":"2022","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.-P., Song, K., Liang, D., Lu, T., Luo, P., Shao, L.: PVT v2: improved baselines with pyramid vision transformer. Comput. Vis. Med. 8(3), 415\u2013424 (2022). https:\/\/doi.org\/10.1007\/s41095-022-0274-8","journal-title":"Comput. Vis. Med."},{"issue":"12","key":"3601_CR58","doi-asserted-by":"publisher","first-page":"7696","DOI":"10.1109\/TCSVT.2023.3278410","volume":"33","author":"X Zhou","year":"2023","unstructured":"Zhou, X., Wu, S., Shi, R., Zheng, B., Wang, S., Yin, H., Zhang, J., Yan, C.: Transformer-based multi-scale feature integration network for video saliency prediction. IEEE Trans. Circuits Syst. Video Technol. 33(12), 7696\u20137707 (2023). https:\/\/doi.org\/10.1109\/TCSVT.2023.3278410","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"5","key":"3601_CR59","doi-asserted-by":"publisher","first-page":"5436","DOI":"10.1109\/TPAMI.2022.3211006","volume":"45","author":"M-H Guo","year":"2023","unstructured":"Guo, M.-H., Liu, Z.-N., Mu, T.-J., Hu, S.-M.: Beyond self-attention: external attention using two linear layers for visual tasks. IEEE Trans. Pattern Anal. Mach. Intell. 45(5), 5436\u20135447 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2022.3211006","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3601_CR60","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset, pp. 6299\u20136308 (2017). https:\/\/openaccess.thecvf.com\/content_cvpr_2017\/html\/Carreira_Quo_Vadis_Action_CVPR_2017_paper.html. Accessed 28 October 2023","DOI":"10.1109\/CVPR.2017.502"},{"key":"3601_CR61","doi-asserted-by":"crossref","unstructured":"Ridnik, T., Ben-Baruch, E., Zamir, N., Noy, A., Friedman, I., Protter, M., Zelnik-Manor, L.: Asymmetric Loss for Multi-Label Classification, pp. 82\u201391 (2021). https:\/\/openaccess.thecvf.com\/content\/ICCV2021\/html\/Ridnik_Asymmetric_Loss_for_Multi-Label_Classification_ICCV_2021_paper.html. Accessed 28 October 2023","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"3601_CR62","doi-asserted-by":"publisher","unstructured":"Huang, Y., Qi, J., Wang, X., Lin, Z.: Asymmetric polynomial loss for multi-label classification. In: ICASSP 2023\u20132023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135 (2023). https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10095437. https:\/\/ieeexplore.ieee.org\/abstract\/document\/10095437. Accessed 28 October 2023","DOI":"10.1109\/ICASSP49357.2023.10095437"},{"key":"3601_CR63","doi-asserted-by":"publisher","unstructured":"Nam, J., Kim, J., Loza\u00a0Menc\u00eda, E., Gurevych, I., F\u00fcrnkranz, J.: Large-scale multi-label text classification\u2014revisiting neural networks. In: Calders, T., Esposito, F., H\u00fcllermeier, E., Meo, R. (eds.) Machine Learning and Knowledge Discovery in Databases. Lecture Notes in Computer Science, pp. 437\u2013452. Springer, Berlin (2014). https:\/\/doi.org\/10.1007\/978-3-662-44851-9_28","DOI":"10.1007\/978-3-662-44851-9_28"},{"issue":"2","key":"3601_CR64","doi-asserted-by":"publisher","first-page":"2533","DOI":"10.1109\/TPAMI.2022.3169976","volume":"45","author":"R Dai","year":"2023","unstructured":"Dai, R., Das, S., Sharma, S., Minciullo, L., Garattoni, L., Bremond, F., Francesca, G.: Toyota Smarthome untrimmed: real-world untrimmed videos for activity detection. IEEE Trans. Pattern Anal. Mach. Intell. 45(2), 2533\u20132550 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2022.3169976","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3601_CR65","doi-asserted-by":"publisher","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) Computer Vision\u2014ECCV 2016. Lecture Notes in Computer Science, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"3601_CR66","unstructured":"Jiang, Y.-G., Liu, J., Zamir, A.R., Toderici, G., Laptev, I., Shah, M., Sukthankar, R.: THUMOS Challenge: Action Recognition with A Large Number of Classes (2014)"},{"key":"3601_CR67","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Bengio, Y., LeCun, Y. (eds.) 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7\u20139, 2015, Conference Track Proceedings (2015). arXiv:1412.6980. Accessed 28 October 2023"},{"key":"3601_CR68","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning Spatiotemporal Features with 3D Convolutional Networks, pp. 4489\u20134497 (2015). https:\/\/openaccess.thecvf.com\/content_iccv_2015\/html\/Tran_Learning_Spatiotemporal_Features_ICCV_2015_paper.html. Accessed 28 October 2023","DOI":"10.1109\/ICCV.2015.510"},{"key":"3601_CR69","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2023.104740","volume":"137","author":"S Li","year":"2023","unstructured":"Li, S., Wang, Z., Liu, Y., Zhang, Y., Zhu, J., Cui, X., Liu, J.: FSformer: fast\u2013slow transformer for video action recognition. Image Vis. Comput. 137, 104740 (2023). https:\/\/doi.org\/10.1016\/j.imavis.2023.104740","journal-title":"Image Vis. Comput."},{"key":"3601_CR70","doi-asserted-by":"crossref","unstructured":"Zeng, Y., Zhong, Y., Feng, C., Ma, L.: Unimd: Towards Unifying Moment Retrieval and Temporal Action Detection. CoRR (2024). arXiv:2404.04933","DOI":"10.1007\/978-3-031-72952-2_17"},{"key":"3601_CR71","doi-asserted-by":"publisher","first-page":"7398","DOI":"10.1109\/TMM.2024.3367599","volume":"26","author":"Q Li","year":"2024","unstructured":"Li, Q., Zu, G., Xu, H., Kong, J., Zhang, Y., Wang, J.: An adaptive dual selective transformer for temporal action localization. IEEE Trans. Multimed. 26, 7398\u20137412 (2024). https:\/\/doi.org\/10.1109\/TMM.2024.3367599","journal-title":"IEEE Trans. Multimed."},{"key":"3601_CR72","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109135","volume":"134","author":"Y Tang","year":"2023","unstructured":"Tang, Y., Zheng, Y., Wei, C., Guo, K., Hu, H., Liang, J.: Video representation learning for temporal action detection using global-local attention. Pattern Recogn. 134, 109135 (2023). https:\/\/doi.org\/10.1016\/j.patcog.2022.109135","journal-title":"Pattern Recogn."}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03601-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03601-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03601-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T09:08:48Z","timestamp":1741597728000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03601-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,30]]},"references-count":72,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["3601"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03601-1","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"type":"print","value":"0178-2789"},{"type":"electronic","value":"1432-2315"}],"subject":[],"published":{"date-parts":[[2024,8,30]]},"assertion":[{"value":"1 August 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 August 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to disclose in any material discussed in this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"This study does not involve any human participant or animal. All the data used in this article are sourced from open and publicly accessible platforms. No proprietary, confidential, or private data has been used.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}},{"value":"The authors have no relevant financial or non-financial interests to disclose.","order":4,"name":"Ethics","group":{"name":"EthicsHeading","label":"Financial or Non-financial interests"}}]}}