{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T05:15:17Z","timestamp":1773119717225,"version":"3.50.1"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2023,7,24]],"date-time":"2023-07-24T00:00:00Z","timestamp":1690156800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,24]],"date-time":"2023-07-24T00:00:00Z","timestamp":1690156800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Open Research Project of National Rail Transit Electrification and Automation Engineering Technology Research Center and Chengdu Guojia Electrical Engineering Co. , Ltd","award":["NEEC-2019-B06"],"award-info":[{"award-number":["NEEC-2019-B06"]}]},{"name":"Science and Technology Innovation Talent Project of Sichuan Province","award":["2021JDRC0012"],"award-info":[{"award-number":["2021JDRC0012"]}]},{"name":"Independent Research Project of National Key Laboratory of Traction Power of China","award":["2019TPL-T19"],"award-info":[{"award-number":["2019TPL-T19"]}]},{"name":"Key Interdisciplinary Basic Research Project of Southwest Jiaotong University","award":["2682021ZTPY089"],"award-info":[{"award-number":["2682021ZTPY089"]}]},{"name":"State Scholarship Fund of China Scholarship Council","award":["202007000101"],"award-info":[{"award-number":["202007000101"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["52277127"],"award-info":[{"award-number":["52277127"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2024,5]]},"DOI":"10.1007\/s00371-023-03018-2","type":"journal-article","created":{"date-parts":[[2023,7,24]],"date-time":"2023-07-24T18:02:32Z","timestamp":1690221752000},"page":"3163-3181","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["SiamMAST: Siamese motion-aware spatio-temporal network for video action recognition"],"prefix":"10.1007","volume":"40","author":[{"given":"Xuemin","family":"Lu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7926-9501","authenticated-orcid":false,"given":"Wei","family":"Quan","sequence":"additional","affiliation":[]},{"given":"Reformat","family":"Marek","sequence":"additional","affiliation":[]},{"given":"Haiquan","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Jim X.","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,7,24]]},"reference":[{"key":"3018_CR1","unstructured":"Krizhevsky, A., Sutskever, I, Hinton, G: ImageNet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems (NIPs), pp. 1097\u20131105 (2012)"},{"key":"3018_CR2","unstructured":"Yu, F., Koltun, V.: Multi-scale context aggregation by dilated convolutions. arXiv preprint arXiv: 1511.07122 (2015)"},{"key":"3018_CR3","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3018_CR4","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. In: Proceedings of 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1933\u20131941 (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"3018_CR5","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos In Proceedings of the Advance Neural Information Processing System, pp. 568\u2013576 (2014)"},{"key":"3018_CR6","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Li, F.: Large-scale video classification with convolutional neural networks. In: 2014 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1725\u20131732 (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"3018_CR7","doi-asserted-by":"crossref","unstructured":"Ng, J., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets: deep networks for video classification. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition, pp. 4694\u20134702 (2015)","DOI":"10.1109\/CVPR.2015.7299101"},{"key":"3018_CR8","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., Gool, L.: Temporal segment networks: towards good practices for deep action recognition. In: European Conference on Computer Vision (ECCV), pp. 20\u201336 (2016)","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"3018_CR9","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE Confernce in Computing Visual Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"3018_CR10","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Confernce in Computing Visual Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"issue":"2","key":"3018_CR11","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1007\/s11263-005-1838-7","volume":"64","author":"I Laptev","year":"2005","unstructured":"Laptev, I.: On space-time interest points. Int. J. Comput. Vis. 64(2), 107\u2013123 (2005)","journal-title":"Int. J. Comput. Vis."},{"issue":"10","key":"3018_CR12","doi-asserted-by":"publisher","first-page":"1383","DOI":"10.1007\/s00371-014-1020-8","volume":"31","author":"Y Li","year":"2015","unstructured":"Li, Y., Ye, J., Wang, T., Huang, S.: Augmenting bag-of-words: a robust contextual representation of spatiotemporal interest points for action recognition. Vis. Comput. 31(10), 1383\u20131394 (2015)","journal-title":"Vis. Comput."},{"issue":"3","key":"3018_CR13","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/s00371-015-1066-2","volume":"32","author":"DD Dawn","year":"2016","unstructured":"Dawn, D.D., Shaikh, S.H.: A comprehensive survey of human action recognition with spatio-temporal interest point (STIP) detector. Vis. Comput. 32(3), 289\u2013306 (2016)","journal-title":"Vis. Comput."},{"issue":"3","key":"3018_CR14","doi-asserted-by":"publisher","first-page":"222","DOI":"10.1007\/s11263-013-0636-x","volume":"105","author":"J Sanchez","year":"2013","unstructured":"Sanchez, J., Perronnin, F., Mensink, T., Verbeek, J.: Image classification with the fisher vector: theory and practice. Int. J. Com. Vis. 105(3), 222\u2013245 (2013)","journal-title":"Int. J. Com. Vis."},{"issue":"9","key":"3018_CR15","doi-asserted-by":"publisher","first-page":"1704","DOI":"10.1109\/TPAMI.2011.235","volume":"34","author":"H Jegou","year":"2012","unstructured":"Jegou, H., Perronnin, F., Douze, M., S\u00e1nchez, J., P\u00e9rez, P., Schmid, C.: Aggregating local image descriptors into compact codes. IEEE Trans. Pattern Anal. Mach. Intell. 34(9), 1704\u20131716 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"34","key":"3018_CR16","first-page":"361","volume":"10","author":"V Thanikachalam","year":"2015","unstructured":"Thanikachalam, V., Thyagharajan, K.: Human action recognition using motion history image and correlation filter. Int. J. Appl. Eng. Res. 10(34), 361\u2013363 (2015)","journal-title":"Int. J. Appl. Eng. Res."},{"key":"3018_CR17","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Dai, Q., Xue, X., Liu, W., Ngo, CW.: Trajectory\u2010based modeling of human actions with motion reference points. In: European Conference on Computer Vision (ECCV), pp. 425\u2013438. Springer (2012)","DOI":"10.1007\/978-3-642-33715-4_31"},{"key":"3018_CR18","doi-asserted-by":"crossref","unstructured":"Sadanand, S., Corso, J.: A high\u2010level representation of activity in video. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1234\u20131241 (2012)","DOI":"10.1109\/CVPR.2012.6247806"},{"key":"3018_CR19","doi-asserted-by":"crossref","unstructured":"Dalal, N., Triggs, B., Schmid, C.: Human detection using oriented histograms of flow and appearance. In: European Conference on Computer Vision (ECCV), pp. 428\u2013441. Springer (2006)","DOI":"10.1007\/11744047_33"},{"key":"3018_CR20","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: 2013 IEEE conference on computer vision (ICCV), pp. 3551\u20133558 (2013)","DOI":"10.1109\/ICCV.2013.441"},{"key":"3018_CR21","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marszalek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: 2008 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1\u20138 (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"3018_CR22","doi-asserted-by":"publisher","first-page":"163","DOI":"10.1016\/j.neucom.2016.03.024","volume":"199","author":"N Raman","year":"2016","unstructured":"Raman, N., Maybank, S.: Activity recognition using a supervised non-parametric hierarchical HMM. Neurocomputing 199, 163\u2013177 (2016)","journal-title":"Neurocomputing"},{"key":"3018_CR23","doi-asserted-by":"crossref","unstructured":"Abidine, M., Fergani, B.: Evaluating C\u2010SVM, CRF and LDA classification for daily activity recognition. In: 2012 International Conference on Multimedia Computing and Systems, pp. 272\u2013277 (2012)","DOI":"10.1109\/ICMCS.2012.6320300"},{"key":"3018_CR24","doi-asserted-by":"crossref","unstructured":"Klaser, A., Marszalek, M., Schmid, C.: A spatio-temporal descriptor based on 3D-gradients, In: 2008 19th British Machine Vision Conference (BMVC), pp. 275\u20131 (2008)","DOI":"10.5244\/C.22.99"},{"key":"3018_CR25","doi-asserted-by":"crossref","unstructured":"Willems, G., Tuytelaars, T., Gool, L.: An efficient dense and scale-invariant spatio-temporal interest point detector. In: European Conference on Computer Vision (ECCV), pp. 650\u2013663. Springer (2008)","DOI":"10.1007\/978-3-540-88688-4_48"},{"key":"3018_CR26","doi-asserted-by":"crossref","unstructured":"Dollar, P., Rabaud, V., Cottrell, G., Belongie, S.: Behavior recognition via sparse spatio-temporal features. In: 2005 IEEE International Workshop on Visual Surveillance and Performance Evaluation of Tracking and Surveillance, pp. 65\u201372 (2005)","DOI":"10.1109\/VSPETS.2005.1570899"},{"key":"3018_CR27","unstructured":"Csurka, G., Dance, C., Fan, L., Willamowski, J., Bray, C.: Visual categorization with bags of keypoints. In: ECCV Workshop on statistical learning in computer vision, pp. 1\u201322 (2004)"},{"key":"3018_CR28","doi-asserted-by":"crossref","unstructured":"Cai, Z., Wang, L., Peng, X., Qiao, Y.: Multi-view super vector for action recognition. In: 2014 IEEE Conference on Computer Vision and Pattern Recognition, pp. 596\u2013603 (2014)","DOI":"10.1109\/CVPR.2014.83"},{"key":"3018_CR29","doi-asserted-by":"crossref","unstructured":"Wang, H., Klaser, A., Schmid, C., Liu, C.: Action recognition by dense trajectories. In: 2011 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3169\u2013317 (2016)","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"3018_CR30","doi-asserted-by":"crossref","unstructured":"Jain, M., Jegou, H., Bouthemy, P.: Better exploiting motion for better action recognition. In 2013 IEEE Conference on Computer Vision and Pattern Recognition, pp. 2555\u20132562 (2013)","DOI":"10.1109\/CVPR.2013.330"},{"key":"3018_CR31","doi-asserted-by":"crossref","first-page":"1327","DOI":"10.1007\/s00371-020-01868-8","volume":"37","author":"D Liang","year":"2021","unstructured":"Liang, D., Liang, H., Yu, Z., Zhang, Y.: Deep convolutional BiLSTM fusion network for facial expression recognition. Vis. Comput. 37, 1327\u20131341 (2021)","journal-title":"Vis. Comput."},{"key":"3018_CR32","doi-asserted-by":"crossref","unstructured":"Donahue, J., Hendricks, L., Guadarrama, S., Rohrbach M., Venugopalan S., Saenko K., Darrell T.: Long-term recurrent convolutional networks for visual recognition and description. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition, pp. 2625\u20132634 (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"3018_CR33","doi-asserted-by":"crossref","unstructured":"Wang, L., Qiao, Y., Tang, X.: Action recognition with trajectory-pooled deep-convolutional descriptors. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition, pp. 4305\u20134314 (2015)","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"3018_CR34","doi-asserted-by":"crossref","unstructured":"Gogi\u00b4c, I., Manhart, M., Pand\u017ei\u00b4c, l., Ahlberg, J.: Fast facial expression recognition using local binary features and shallow neural networks. Vis. Comput. 36, 97\u2013112 (2020)","DOI":"10.1007\/s00371-018-1585-8"},{"key":"3018_CR35","doi-asserted-by":"publisher","first-page":"1821","DOI":"10.1007\/s00371-020-01940-3","volume":"37","author":"A Abdelbaky","year":"2021","unstructured":"Abdelbaky, A., Aly, S.: Two-stream spatiotemporal feature fusion for human action recognition. Vis. Comput. 37, 1821\u20131835 (2021)","journal-title":"Vis. Comput."},{"key":"3018_CR36","unstructured":"Chan, T., Jia, K., Gao, S., Lu, J., Zeng, Z., Ma, Y.: PCANet: A Simple Deep Learning Baseline for Image Classification? arXiv preprint arXiv: 1404.3606v2 (2014)"},{"key":"3018_CR37","doi-asserted-by":"crossref","unstructured":"Tao, R., Gavves, E., Smeulders, A.: Siamese Instance Search for Tracking. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1420\u20131429 (2016)","DOI":"10.1109\/CVPR.2016.158"},{"key":"3018_CR38","unstructured":"Soomro, K., Zamir, A., Shah, M.: UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint: arXiv:1212.0402 (2012)"},{"key":"3018_CR39","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: A large video database for human motion recognition. In: 2011 IEEE Conference on Computer Vision (ICCV), pp. 2556\u20132563 (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"3018_CR40","doi-asserted-by":"crossref","unstructured":"Liu, H., Jie, Z., Jayashree, K., Qi, M., Jiang, J., Yan, S., Feng, J.: Video-based Person Re-identification with accumulative motion context. IEEE Trans Circuits Syst Video Technol 28(10):2788\u20132802 (2018)","DOI":"10.1109\/TCSVT.2017.2715499"},{"issue":"3","key":"3018_CR41","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Hua, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A., Li, F.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"3018_CR42","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/j.cviu.2016.03.013","volume":"150","author":"X Peng","year":"2016","unstructured":"Peng, X., Wang, L., Wang, X., Qiao, Y.: Bag of visual words and fusion methods for action recognition: comprehensive study and good practice. Comput. Vis. Image Underst. 150, 109\u2013125 (2016)","journal-title":"Comput. Vis. Image Underst."},{"issue":"6","key":"3018_CR43","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TPAMI.2017.2712608","volume":"40","author":"G Varol","year":"2017","unstructured":"Varol, G., Laptev, I., Schmid, C.: Long-term temporal convolutions for action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1510\u20131517 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3018_CR44","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: 2011 IEEE Conference on Computer Vision (ICCV), pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"3018_CR45","doi-asserted-by":"crossref","unstructured":"Bilen, H., Fernando, B., Gavves, E., Vedaldi, A., Gould, S.: Dynamic image networks for action recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3034\u20133042 (2016)","DOI":"10.1109\/CVPR.2016.331"},{"key":"3018_CR46","doi-asserted-by":"crossref","unstructured":"Zhu, W., Hu, J., Sun, G., Cao, X., Qiao, Y.: A key volume mining deep framework for action recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1991\u20131999 (2016)","DOI":"10.1109\/CVPR.2016.219"},{"key":"3018_CR47","unstructured":"Tran, D., Ray, J., Shou, Z., Chang, S., Paluri, M.: Convnet architecture search for spatiotemporal feature learning. arXiv preprint: arXiv:1708.05038 (2017)"},{"key":"3018_CR48","unstructured":"Diba, A., Fayyaz, M., Sharma, V., Karami, A., Arzani, M., Yousefzadeh, R., Gool, L.: Temporal 3d convnets: New architecture and transfer learning for video classification. arXiv preprint: arXiv:1711.08200 (2017)"},{"issue":"11","key":"3018_CR49","first-page":"2579","volume":"9","author":"L Maaten","year":"2008","unstructured":"Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11), 2579\u20132605 (2008)","journal-title":"J. Mach. Learn. Res."}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-023-03018-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-023-03018-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-023-03018-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,24]],"date-time":"2024-10-24T22:13:18Z","timestamp":1729807998000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-023-03018-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,24]]},"references-count":49,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2024,5]]}},"alternative-id":["3018"],"URL":"https:\/\/doi.org\/10.1007\/s00371-023-03018-2","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,7,24]]},"assertion":[{"value":"3 July 2023","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 July 2023","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}