{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T04:29:01Z","timestamp":1771907341980,"version":"3.50.1"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,5,15]],"date-time":"2023-05-15T00:00:00Z","timestamp":1684108800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,5,15]],"date-time":"2023-05-15T00:00:00Z","timestamp":1684108800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["51935005"],"award-info":[{"award-number":["51935005"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Basic Research Key Project","award":["JCKY20200603C010"],"award-info":[{"award-number":["JCKY20200603C010"]}]},{"DOI":"10.13039\/501100005046","name":"Natural Science Foundation of Heilongjiang Province of China","doi-asserted-by":"crossref","award":["LH2021F023"],"award-info":[{"award-number":["LH2021F023"]}],"id":[{"id":"10.13039\/501100005046","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Science & Technology Planned Project of Heilongjiang Province of China","award":["GA21C031"],"award-info":[{"award-number":["GA21C031"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Pattern Anal Applic"],"published-print":{"date-parts":[[2023,8]]},"DOI":"10.1007\/s10044-023-01166-8","type":"journal-article","created":{"date-parts":[[2023,5,15]],"date-time":"2023-05-15T14:21:23Z","timestamp":1684160483000},"page":"1375-1393","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["TSRN: two-stage refinement network for temporal action segmentation"],"prefix":"10.1007","volume":"26","author":[{"given":"Xiaoyan","family":"Tian","sequence":"first","affiliation":[]},{"given":"Ye","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Xianglong","family":"Tang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,5,15]]},"reference":[{"issue":"2","key":"1166_CR1","doi-asserted-by":"publisher","first-page":"611","DOI":"10.1007\/s10044-019-00821-3","volume":"23","author":"IP Febin","year":"2020","unstructured":"Febin IP, Jayasree K, Joy PT (2020) Violence detection in videos for an intelligent surveillance system using MoBSIFT and movement filtering algorithm. Pattern Anal Appl 23(2):611\u2013623","journal-title":"Pattern Anal Appl"},{"key":"1166_CR2","doi-asserted-by":"publisher","first-page":"182","DOI":"10.1016\/j.jpdc.2018.06.012","volume":"120","author":"Z Pan","year":"2018","unstructured":"Pan Z, Liu S, Sangaiah AK, Muhammad K (2018) Visual attention feature (VAF): a novel strategy for visual tracking based on cloud platform in intelligent surveillance systems. J Parallel Distr Com 120:182\u2013194","journal-title":"J Parallel Distr Com"},{"issue":"4","key":"1166_CR3","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pcbi.1008935","volume":"17","author":"J Stenum","year":"2021","unstructured":"Stenum J, Rossi C, Roemmich RT (2021) Two-dimensional video-based analysis of human gait using pose estimation. Plos Comput Biol 17(4):e1008935","journal-title":"Plos Comput Biol"},{"key":"1166_CR4","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Wildes RP (2017) Spatiotemporal multiplier networks for video action recognition. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 4768\u20134777","DOI":"10.1109\/CVPR.2017.787"},{"key":"1166_CR5","unstructured":"Ding L, Xu C (2017) Tricornet: A hybrid temporal convolutional and recurrent network for video action segmentation. arXiv preprint arXiv:1705.07818"},{"key":"1166_CR6","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) Tsm: Temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF international conference on computer vision (ICCV), IEEE, pp 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"1166_CR7","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. arXiv preprint arXiv:1406.2199"},{"key":"1166_CR8","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision (ICCV), IEEE, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"1166_CR9","doi-asserted-by":"crossref","unstructured":"Singh B, Marks TK, Jones M, Tuzel O, Shao M (2016) A multi-stream bi-directional recurrent neural network for fine-grained action detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 1961\u20131970","DOI":"10.1109\/CVPR.2016.216"},{"key":"1166_CR10","doi-asserted-by":"crossref","unstructured":"Lea C, Flynn MD, Vidal R, Reiter A, Hager GD (2017) Temporal convolutional networks for action segmentation and detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 156\u2013165","DOI":"10.1109\/CVPR.2017.113"},{"key":"1166_CR11","doi-asserted-by":"crossref","unstructured":"Farha YA, Gall J (2019) Ms-tcn: Multi-stage temporal convolutional network for action segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 3575\u20133584","DOI":"10.1109\/CVPR.2019.00369"},{"key":"1166_CR12","doi-asserted-by":"crossref","unstructured":"Wang Z, Gao Z, Wang L, Li Z, Wu G (2020) Boundary-aware cascade networks for temporal action segmentation. In: Proceedings of the European conference on computer vision (ECCV), Springer, pp 34\u201351","DOI":"10.1007\/978-3-030-58595-2_3"},{"key":"1166_CR13","doi-asserted-by":"crossref","unstructured":"Ishikawa Y, Kasai S, Aoki Y, Kataoka H (2021) Alleviating over-segmentation errors by detecting action boundaries. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision (WACV), IEEE, pp 2322\u20132331","DOI":"10.1109\/WACV48630.2021.00237"},{"key":"1166_CR14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3021756","author":"SJ Li","year":"2020","unstructured":"Li SJ, Abufarha Y, Liu Y, Cheng MM, Gall J (2020) Ms-tcn++: Multi-stage temporal convolutional network for action segmentation. IEEE Trans Pattern Anal. https:\/\/doi.org\/10.1109\/TPAMI.2020.3021756","journal-title":"IEEE Trans Pattern Anal"},{"key":"1166_CR15","doi-asserted-by":"crossref","unstructured":"Chen MH, Li B, Bao Y, Alregib G, Kira Z (2020) Action segmentation with joint self-supervised temporal domain adaptation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 9454\u20139463","DOI":"10.1109\/CVPR42600.2020.00947"},{"key":"1166_CR16","doi-asserted-by":"crossref","unstructured":"Wang D, Hu D, Li X, Dou D (2021) Temporal Relational Modeling with Self-Supervision for Action Segmentation. In: Proceedings of the aaai conference on artificial intelligence (AAAI). 35(4), pp 2729\u20132737","DOI":"10.1609\/aaai.v35i4.16377"},{"key":"1166_CR17","doi-asserted-by":"crossref","unstructured":"Stein S, Mckenna SJ (2013) Combining embedded accelerometers with computer vision for recognizing food preparation activities. In: Proceedings of the 2013 ACM international joint conference on Pervasive and ubiquitous computing, pp 729\u2013738","DOI":"10.1145\/2493432.2493482"},{"key":"1166_CR18","doi-asserted-by":"crossref","unstructured":"Fathi A, Ren X, Rehg JM (2011) Learning to recognize objects in egocentric activities. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 3281\u20133288","DOI":"10.1109\/CVPR.2011.5995444"},{"key":"1166_CR19","doi-asserted-by":"crossref","unstructured":"Kuehne H, Arslan A, Serre T (2014) The language of actions: Recovering the syntax and semantics of goal-directed human activities. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 780\u2013787","DOI":"10.1109\/CVPR.2014.105"},{"key":"1166_CR20","unstructured":"Oord AVD, Dieleman S, Zen H, Simonyan K, Vinyals O, Graves A, Kalchbrenner N, Senior A, Kavukcuoglu K (2016) Wavenet: A generative model for raw audio. arXiv preprint arXiv:1609.03499."},{"key":"1166_CR21","doi-asserted-by":"crossref","unstructured":"Lei P, Todorovic S (2018) Temporal deformable residual networks for action segmentation in videos. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition (CVPR), IEEE, pp 6742\u20136751","DOI":"10.1109\/CVPR.2018.00705"},{"key":"1166_CR22","doi-asserted-by":"crossref","unstructured":"Zhang Y, Tang S, Muandet K, Jarvers C, Neumann H (2019) Local temporal bilinear pooling for fine-grained action parsing. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 12005\u201312015","DOI":"10.1109\/CVPR.2019.01228"},{"key":"1166_CR23","doi-asserted-by":"publisher","first-page":"63","DOI":"10.1016\/j.neucom.2020.03.066","volume":"407","author":"D Wang","year":"2020","unstructured":"Wang D, Yuan Y, Wang Q (2020) Gated forward refinement network for action segmentation. Neurocomputing 407:63\u201371","journal-title":"Neurocomputing"},{"key":"1166_CR24","doi-asserted-by":"crossref","unstructured":"Huang Y, Sugano Y, Sato Y (2020) Improving action segmentation via graph-based temporal reasoning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 14024\u201314034","DOI":"10.1109\/CVPR42600.2020.01404"},{"key":"1166_CR25","doi-asserted-by":"crossref","unstructured":"Chen MH, Li B, Bao Y, Alregib G (2020) Action segmentation with mixed temporal domain adaptation. In: Proceedings of the IEEE\/CVF Winter conference on applications of computer vision (WACV), IEEE, pp 605\u2013614","DOI":"10.1109\/WACV45572.2020.9093535"},{"key":"1166_CR26","doi-asserted-by":"crossref","unstructured":"Gao SH, Han Q, Li ZY, Peng P, Wang L, Cheng MM (2021) Global2local: Efficient structure search for video action segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR), IEEE, pp 16805\u201316814","DOI":"10.1109\/CVPR46437.2021.01653"},{"key":"1166_CR27","doi-asserted-by":"crossref","unstructured":"Kitaev N, Cao S, Klein D (2018) Multilingual constituency parsing with self-attention and pre-training. arXiv preprint arXiv:1812.11760.","DOI":"10.18653\/v1\/P19-1340"},{"issue":"3","key":"1166_CR28","doi-asserted-by":"publisher","first-page":"1347","DOI":"10.1007\/s10044-021-00989-7","volume":"24","author":"X Cheng","year":"2021","unstructured":"Cheng X, Qiu G, Jiang Y, Zhu Z (2021) An improved small object detection method based on Yolo V3. Pattern Anal Appl 24(3):1347\u20131355","journal-title":"Pattern Anal Appl"},{"key":"1166_CR29","doi-asserted-by":"crossref","unstructured":"Kuehne H, Gall J, Serre T (2016) An end-to-end generative framework for video segmentation and recognition. In: Processing of the IEEE\/CVF Winter conference on applications of computer vision (WACV), IEEE, pp 1\u20138","DOI":"10.1109\/WACV.2016.7477701"},{"key":"1166_CR30","doi-asserted-by":"crossref","unstructured":"Arnab A, Dehghani M, Heigold G, Sun C, Lucic M, Schmid C (2021) Vivit: A video vision transformer. In: Proceedings of the IEEE\/CVF International conference on computer Vision (ICCV), IEEE, pp 6836\u20136846","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"1166_CR31","doi-asserted-by":"crossref","unstructured":"Zheng S, Lu J, Zhao H, Zhu X, Luo Z, Wang Y, Fu Y, Feng J, Xiang T, Torr PHS (2021) Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition (CVPR), IEEE, pp 6881\u20136890","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"1166_CR32","first-page":"5998","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30:5998\u20136008","journal-title":"Adv Neural Inf Process Syst"},{"issue":"4","key":"1166_CR33","doi-asserted-by":"publisher","first-page":"2128","DOI":"10.1007\/s10489-020-01933-8","volume":"51","author":"L He","year":"2021","unstructured":"He L, Wen S, Wang L, Li F (2021) Vehicle theft recognition from surveillance video based on spatiotemporal attention. Appl Intell 51(4):2128\u20132143","journal-title":"Appl Intell"},{"issue":"4","key":"1166_CR34","doi-asserted-by":"publisher","first-page":"1045","DOI":"10.1007\/s10489-019-01587-1","volume":"50","author":"J Wang","year":"2020","unstructured":"Wang J, Xiong H, Wang H, Nian X (2020) ADSCNet: asymmetric depthwise separable convolution for semantic segmentation in real-time. Appl Intell 50(4):1045\u20131056","journal-title":"Appl Intell"},{"key":"1166_CR35","doi-asserted-by":"crossref","unstructured":"Hu J, Shen L, Sun G (2018) Squeeze-and-excitation networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 7132\u20137141","DOI":"10.1109\/CVPR.2018.00745"},{"key":"1166_CR36","doi-asserted-by":"crossref","unstructured":"Woo S, Park J, Lee JY, Kweon IS (2018) Cbam: Convolutional block attention module. In: Proceedings of the European conference on computer vision (ECCV), Springer, pp 3\u201319","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"1166_CR37","doi-asserted-by":"crossref","unstructured":"Lin TY, Dollar P, Girshick R, He K, Hariharan B, Belongie S (2017) Feature pyramid networks for object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 2117\u20132125","DOI":"10.1109\/CVPR.2017.106"},{"key":"1166_CR38","doi-asserted-by":"crossref","unstructured":"Lin TY, Goyal P, Girshick R, He K, Dollar P (2017) Focal loss for dense object detection. In: Proceedings of the IEEE international conference on computer vision (ICCV), IEEE, pp 2980\u20132988","DOI":"10.1109\/ICCV.2017.324"},{"issue":"9","key":"1166_CR39","doi-asserted-by":"publisher","first-page":"1904","DOI":"10.1109\/TPAMI.2015.2389824","volume":"37","author":"K He","year":"2015","unstructured":"He K, Zhang X, Ren S, Sun J (2015) Spatial pyramid pooling in deep convolutional networks for visual recognition. IEEE Trans Pattern Anal Mach Intell 37(9):1904\u20131916","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1166_CR40","doi-asserted-by":"crossref","unstructured":"Tang K, Li FF, Koller D (2012) Learning latent temporal structure for complex event detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 1250\u20131257","DOI":"10.1109\/CVPR.2012.6247808"},{"issue":"8","key":"1166_CR41","first-page":"707","volume":"10","author":"VI Levenshtein","year":"1966","unstructured":"Levenshtein VI (1966) Binary codes capable of correcting deletions, insertions, and reversals. Soviet physics doklady 10(8):707\u2013710","journal-title":"Soviet physics doklady"},{"key":"1166_CR42","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on computer vision and pattern recognition (CVPR), IEEE, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"1166_CR43","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne Hendricks L, Guadarrama S et al (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"1166_CR44","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S et al (2015) Show and tell: A neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 3156\u20133164","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"1166_CR45","doi-asserted-by":"crossref","unstructured":"Tao L, Zappella L, Hager GD et al (2013) Surgical gesture segmentation and recognition. In: 2013 International conference on medical image computing and computer-assisted intervention (MICCAI), Springer, pp 339\u2013346","DOI":"10.1007\/978-3-642-40760-4_43"},{"key":"1166_CR46","doi-asserted-by":"crossref","unstructured":"Rohrbach M, Amin S, Andriluka M et al (2012) A database for fine grained activity detection of cooking activities. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 1194\u20131201","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"1166_CR47","doi-asserted-by":"crossref","unstructured":"Cheng Y, Fan Q, Pankanti S et al (2014) Temporal sequence modeling for video event detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 2227\u20132234","DOI":"10.1109\/CVPR.2014.286"},{"key":"1166_CR48","doi-asserted-by":"crossref","unstructured":"Lea C, Reiter A, Vidal R, et al (2016) Segmental spatiotemporal cnns for fine-grained action segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), Springer, pp 36\u201352","DOI":"10.1007\/978-3-319-46487-9_3"},{"key":"1166_CR49","unstructured":"Zhang Y, Muandet K, Ma Q (2019) Frontal low-rank random tensors for fine-grained action segmentation. arXiv preprint arXiv:1906.01004."},{"key":"1166_CR50","doi-asserted-by":"crossref","unstructured":"Mac KNC, Joshi D, Yeh RA, Xiong J, Feris RS, Do MN (2019) Learning motion in feature space: locally-consistent deformable convolution networks for fine-grained action detection. In: Proceedings of the IEEE\/CVF International conference on computer vision (ICCV), IEEE, pp 6282\u20136291","DOI":"10.1109\/ICCV.2019.00638"},{"key":"1166_CR51","doi-asserted-by":"crossref","unstructured":"Richard A, Kuehne H, Gall J (2017) Weakly supervised action learning with rnn based fine-to-coarse modeling. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), IEEE, pp 754\u2013763","DOI":"10.1109\/CVPR.2017.140"},{"issue":"12","key":"1166_CR52","doi-asserted-by":"publisher","first-page":"9904","DOI":"10.1109\/TPAMI.2021.3132068","volume":"44","author":"Z Li","year":"2021","unstructured":"Li Z, Sun Y, Zhang L et al (2021) CTNet: context-based tandem network for semantic segmentation. IEEE Trans Pattern Anal Mach Intell 44(12):9904\u20139917","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"1166_CR53","doi-asserted-by":"crossref","unstructured":"Zhou H, Li Z, Ning C, et al (2017) Cad: Scale invariant framework for real-time object detection. In: Proceedings of the IEEE international conference on computer vision workshops, pp 760\u2013768","DOI":"10.1109\/ICCVW.2017.95"}],"container-title":["Pattern Analysis and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-023-01166-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10044-023-01166-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10044-023-01166-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,22]],"date-time":"2023-07-22T14:09:28Z","timestamp":1690034968000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10044-023-01166-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,15]]},"references-count":53,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2023,8]]}},"alternative-id":["1166"],"URL":"https:\/\/doi.org\/10.1007\/s10044-023-01166-8","relation":{},"ISSN":["1433-7541","1433-755X"],"issn-type":[{"value":"1433-7541","type":"print"},{"value":"1433-755X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,5,15]]},"assertion":[{"value":"30 March 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 April 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 May 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that have influenced the work reported in this manuscript.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing Interests"}},{"value":"All data included in this study are available upon request by contact with the corresponding author.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Data availability"}}]}}