{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T16:17:35Z","timestamp":1761581855280},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2019,7,25]],"date-time":"2019-07-25T00:00:00Z","timestamp":1564012800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,7,25]],"date-time":"2019-07-25T00:00:00Z","timestamp":1564012800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"2016 Guangzhou Innovation and Entrepreneurship Leader Team","award":["CXLJTD-201608"],"award-info":[{"award-number":["CXLJTD-201608"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2020,6]]},"DOI":"10.1007\/s00371-019-01733-3","type":"journal-article","created":{"date-parts":[[2019,7,25]],"date-time":"2019-07-25T07:33:11Z","timestamp":1564039991000},"page":"1261-1270","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":30,"title":["3D RANs: 3D Residual Attention Networks for action recognition"],"prefix":"10.1007","volume":"36","author":[{"given":"Jiahui","family":"Cai","sequence":"first","affiliation":[]},{"given":"Jianguo","family":"Hu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,7,25]]},"reference":[{"key":"1733_CR1","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"1733_CR2","doi-asserted-by":"publisher","first-page":"1143","DOI":"10.1007\/s00371-019-01692-9","volume":"35","author":"Y Li","year":"2019","unstructured":"Li, Y., Wang, Z., Yang, X., Wang, M., Poiana, S.I., Chaudhry, E., Zhang, J.: Efficient convolutional hierarchical autoencoder for human motion prediction. Vis. Comput. 35, 1143\u20131156 (2019)","journal-title":"Vis. Comput."},{"key":"1733_CR3","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4724\u20134733 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"1733_CR4","doi-asserted-by":"crossref","unstructured":"Wang, L., Qiao, Y., Tang, X.: Action recognition with trajectory-pooled deep-convolutional descriptors. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4305\u20134314 (2015)","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"1733_CR5","doi-asserted-by":"crossref","unstructured":"Wang, X., Farhadi, A., Gupta, A.: Actions$${}^\\sim $$ transformations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2658\u20132667 (2016)","DOI":"10.1109\/CVPR.2016.291"},{"key":"1733_CR6","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., Van Gool, L.: Temporal segment networks: towards good practices for deep action recognition. In: European Conference on Computer Vision, pp. 20\u201336 (2016)","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"1733_CR7","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1725\u20131732 (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"1733_CR8","doi-asserted-by":"crossref","unstructured":"Zhang, B., Wang, L., Wang, Z., Qiao, Y., Wang, H.: Real-time action recognition with enhanced motion vector CNNs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2718\u20132726 (2015)","DOI":"10.1109\/CVPR.2016.297"},{"key":"1733_CR9","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3551\u20133558 (2013)","DOI":"10.1109\/ICCV.2013.441"},{"key":"1733_CR10","doi-asserted-by":"crossref","unstructured":"Scovanner, P., Ali, S., Shah, M.: A 3-dimensional sift descriptor and its application to action recognition. In: Proceedings of the 15th ACM International Conference on Multimedia, pp. 357\u2013360 (2007)","DOI":"10.1145\/1291233.1291311"},{"key":"1733_CR11","doi-asserted-by":"crossref","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., Liu, C. L.: Action recognition by dense trajectories. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3169\u20133176 (2011)","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"1733_CR12","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marszalek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: IEEE Conference on Computer Vision and Pattern Recognition. CVPR, pp. 1\u20138 (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"1733_CR13","first-page":"568","volume":"27","author":"K Simonyan","year":"2014","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. Adv. Neural Inf. Process. Syst. 27, 568\u2013576 (2014)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1733_CR14","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M., Yu, K.: 3D convolutional neural networks for human action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 35, 221\u2013231 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1733_CR15","unstructured":"Wang, Y., Jiang, L., Yang, M. H., Li, L. J., Long, M., Fei-Fei, L.: Eidetic 3D LSTM: A Model for Video Prediction and Beyond (2013)"},{"key":"1733_CR16","doi-asserted-by":"publisher","first-page":"32275","DOI":"10.1007\/s11042-018-6260-6","volume":"77","author":"Z Ma","year":"2018","unstructured":"Ma, Z., Sun, Z.: Time-varying LSTM networks for action recognition. Multimed. Tools Appl. 77, 32275\u201332285 (2018)","journal-title":"Multimed. Tools Appl."},{"key":"1733_CR17","doi-asserted-by":"publisher","unstructured":"Liang, D., Liang, H., Yu, Z., Zhang, Y.: Deep convolutional BiLSTM fusion network for facial expression recognition. Vis. Comput (2019). \nhttps:\/\/doi.org\/10.1007\/s00371-019-01636-3","DOI":"10.1007\/s00371-019-01636-3"},{"key":"1733_CR18","doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., Satoh, Y.: Learning spatio-temporal features with 3D residual networks for action recognition. In: Proceedings of the ICCV Workshop on Action, Gesture, and Emotion Recognition, pp. 4 (2017)","DOI":"10.1109\/ICCVW.2017.373"},{"key":"1733_CR19","unstructured":"Nair, V., Hinton, G. E.: Rectified linear units improve restricted boltzmann machines. In: Proceedings of the 27th International Conference on Machine Learning (ICML-10), pp. 807\u2013814 (2010)"},{"key":"1733_CR20","unstructured":"Ba, J., Mnih, V., Kavukcuoglu, K.: Multiple object recognition with visual attention (2014)"},{"key":"1733_CR21","unstructured":"Mnih, V., Heess, N., Graves, A.: Recurrent models of visual attention. Adv. Neural Inf. Process. Syst. 27, 2204\u20132212 (2014)"},{"key":"1733_CR22","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: A dataset of 101 human actions classes from videos in the wild (2012)"},{"key":"1733_CR23","first-page":"571","volume":"12","author":"H Kuehne","year":"2013","unstructured":"Kuehne, H., Jhuang, H., Stiefelhagen, R., Serre, T.: Hmdb51: a large video database for human motion recognition. High Perform. Comput. Sci. Eng. 12, 571\u2013582 (2013)","journal-title":"High Perform. Comput. Sci. Eng."},{"key":"1733_CR24","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Suleyman, M.: The kinetics human action video dataset (2017)"},{"key":"1733_CR25","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1733_CR26","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1933\u20131941 (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"1733_CR27","doi-asserted-by":"crossref","unstructured":"Li, W., Nie, W., Su, Y.: Human action recognition based on selected spatio-temporal features via bidirectional LSTM. In: IEEE Access, pp. 44211\u201344220 (2018)","DOI":"10.1109\/ACCESS.2018.2863943"},{"key":"1733_CR28","doi-asserted-by":"publisher","first-page":"3459","DOI":"10.1109\/TIP.2018.2818328","volume":"27","author":"S Song","year":"2018","unstructured":"Song, S., Lan, C., Xing, J., Zeng, W., Liu, J.: Spatio-temporal attention based LSTM networks for 3D action recognition and detection. IEEE Trans. Image Process. 27, 3459\u20133471 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"1733_CR29","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1109\/34.730558","volume":"11","author":"L Itti","year":"1998","unstructured":"Itti, L., Koch, C., Niebur, E.: A model of saliency-based visual attention for rapid scene analysis. IEEE Trans. Pattern Anal. Mach. Intell. 11, 1254\u20131259 (1998)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1733_CR30","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1080\/135062800394667","volume":"7","author":"RA Rensink","year":"2000","unstructured":"Rensink, R.A.: The dynamic representation of scenes. Vis. Cognit. 7, 17\u201342 (2000)","journal-title":"Vis. Cognit."},{"key":"1733_CR31","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1038\/nrn755","volume":"3","author":"M Corbetta","year":"2002","unstructured":"Corbetta, M., Shulman, G.L.: Control of goal-directed and stimulus-driven attention in the brain. Nat. Rev. Neurosci. 3, 201 (2002)","journal-title":"Nat. Rev. Neurosci."},{"key":"1733_CR32","first-page":"1243","volume":"23","author":"H Larochelle","year":"2010","unstructured":"Larochelle, H., Hinton, G.E.: Learning to combine foveal glimpses with a third-order Boltzmann machine. Adv. Neural Inf. Process. Syst. 23, 1243\u20131251 (2010)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1733_CR33","doi-asserted-by":"crossref","unstructured":"Olshausen, B. A., Anderson, C. H., Van Essen, D. C.: A neurobiological model of visual attention and invariant pattern recognition based on dynamic routing of information. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4700\u20134719 (1993)","DOI":"10.1523\/JNEUROSCI.13-11-04700.1993"},{"key":"1733_CR34","doi-asserted-by":"crossref","unstructured":"Cao, C., Liu, X., Yang, Y., Yu, Y., Wang, J., Wang, Z., Ramanan, D.: Look and think twice: capturing top-down visual attention with feedback convolutional neural networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2956\u20132964 (2015)","DOI":"10.1109\/ICCV.2015.338"},{"key":"1733_CR35","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A.: Recurrent spatial transformer networks. In: Computer Science, (2015)"},{"key":"1733_CR36","doi-asserted-by":"crossref","unstructured":"Wang, F., Jiang, M., Qian, C., Yang, S., Li, C., Zhang, H., Tang, X.: Residual attention network for image classification. In: Computer Vision and Pattern Recognition, pp. 6450\u20136458 (2017)","DOI":"10.1109\/CVPR.2017.683"},{"key":"1733_CR37","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks (2017)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"1733_CR38","doi-asserted-by":"crossref","unstructured":"Woo, S., Park, J., Lee, J. Y., Kweon, I. S.: CBAM: Convolutional Block Attention Module. In: Proceedings of European Conference on Computer Vision (2018)","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"1733_CR39","unstructured":"Sharma, S., Kiros, R., Salakhutdinov, R.: Action recognition using visual attention. In: Computer Science (2015)"},{"key":"1733_CR40","doi-asserted-by":"crossref","unstructured":"Kim, D. , Cho, D. , Kweon, I. S.: Self-supervised video representation learning with space-time cubic puzzles. arXiv preprint \narXiv:1811.09795\n\n (2018)","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"1733_CR41","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"1733_CR42","unstructured":"Hinton, G. E.: Rectified linear units improve restricted Boltzmann machines Vinod Nair (2010)"},{"key":"1733_CR43","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"1733_CR44","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. pp. 448\u2013456 (2015)"},{"key":"1733_CR45","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y.: Towards good practices for very deep two-stream ConvNets. In: Computer Science (2015)"},{"key":"1733_CR46","doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., Satoh, Y.: Can spatiotemporal 3D CNNs retrace the history of 2D CNNs and ImageNet?. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 18\u201322 (2017)","DOI":"10.1109\/CVPR.2018.00685"},{"key":"1733_CR47","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated Residual transformations for deep neural networks. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 5987\u20135995 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"1733_CR48","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Mei, T.: Learning spatio-temporal representation with Pseudo-3D residual networks. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 5534\u20135542 (2017)","DOI":"10.1109\/ICCV.2017.590"},{"key":"1733_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Sun, X., Zha, Z. J., Zeng, W.: MiCT: mixed 3D\/2D convolutional tube for human action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 449\u2013458 (2018)","DOI":"10.1109\/CVPR.2018.00054"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-019-01733-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s00371-019-01733-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-019-01733-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,7,23]],"date-time":"2020-07-23T23:12:09Z","timestamp":1595545929000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s00371-019-01733-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,7,25]]},"references-count":49,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2020,6]]}},"alternative-id":["1733"],"URL":"https:\/\/doi.org\/10.1007\/s00371-019-01733-3","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,7,25]]},"assertion":[{"value":"25 July 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}