{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T06:12:27Z","timestamp":1764828747313,"version":"build-2065373602"},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2020,4,27]],"date-time":"2020-04-27T00:00:00Z","timestamp":1587945600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,4,27]],"date-time":"2020-04-27T00:00:00Z","timestamp":1587945600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nat Mach Intell"],"DOI":"10.1038\/s42256-020-0168-3","type":"journal-article","created":{"date-parts":[[2020,4,27]],"date-time":"2020-04-27T16:56:37Z","timestamp":1588006597000},"page":"245-253","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["Complex sequential understanding through the awareness of spatial and temporal concepts"],"prefix":"10.1038","volume":"2","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4521-6369","authenticated-orcid":false,"given":"Bo","family":"Pang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kaiwen","family":"Zha","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hanwen","family":"Cao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiajun","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minghui","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1533-8576","authenticated-orcid":false,"given":"Cewu","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,4,27]]},"reference":[{"key":"168_CR1","unstructured":"Graves, A. Generating sequences with recurrent neural networks. Preprint at https:\/\/arxiv.org\/abs\/1308.0850 (2013)."},{"key":"168_CR2","unstructured":"Sutskever, I., Vinyals, O. & Le, Q. V. Sequence to sequence learning with neural networks. In Annual Conference on Neural Information Processing Systems 3104\u20133112 (ACM, 2014)."},{"key":"168_CR3","unstructured":"Krizhevsky, A., Sutskever, I. & Hinton, G. ImageNet classification with deep convolutional neural networks. In Annual Conference on Neural Information Processing Systems 1097\u20131105 (ACM, 2012)."},{"key":"168_CR4","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S. & Sun, J. Deep residual learning for image recognition. In IEEE Conference on Computer Vision and Pattern Recognition 770\u2013778 (IEEE, 2016).","DOI":"10.1109\/CVPR.2016.90"},{"key":"168_CR5","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P. & Girshick, R. Mask R-CNN. In IEEE International Conference on Computer Vision 2980\u20132988 (IEEE, 2017).","DOI":"10.1109\/ICCV.2017.322"},{"key":"168_CR6","first-page":"1334","volume":"17","author":"S Levine","year":"2016","unstructured":"Levine, S., Finn, C., Darrell, T. & Abbeel, P. End-to-end training of deep visuomotor policies. J. Mach. Learn. Res. 17, 1334\u20131373 (2016).","journal-title":"J. Mach. Learn. Res."},{"key":"168_CR7","unstructured":"Schulman, J., Levine, S., Abbeel, P., Jordan, M. & Moritz, P. Trust region policy optimization. In International Conference on Machine Learning 1889\u20131897 (PMLR, 2015)."},{"key":"168_CR8","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J. & He, K. SlowFast networks for video recognition. In IEEE International Conference on Computer Vision 6202\u20136211 (IEEE, 2019).","DOI":"10.1109\/ICCV.2019.00630"},{"key":"168_CR9","doi-asserted-by":"crossref","unstructured":"Kim, J., El-Khamy, M. & Lee, J. Residual LSTM: design of a deep recurrent architecture for distant speech recognition. In Conference of the International Speech Communication Association 1591\u20131595 (ICSA, 2017).","DOI":"10.21437\/Interspeech.2017-477"},{"key":"168_CR10","doi-asserted-by":"publisher","first-page":"1317","DOI":"10.1016\/j.neuron.2015.08.036","volume":"87","author":"T Kitamura","year":"2015","unstructured":"Kitamura, T. et al. Entorhinal cortical ocean cells encode specific contexts and drive context-specific fear memory. Neuron 87, 1317\u20131331 (2015).","journal-title":"Neuron"},{"key":"168_CR11","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1007\/s00221-009-1834-1","volume":"195","author":"M Oliveri","year":"2009","unstructured":"Oliveri, M., Koch, G. & Caltagirone, C. Spatial-temporal interactions in the human brain. Exp. Brain Res. 195, 489\u2013497 (2009).","journal-title":"Exp. Brain Res."},{"key":"168_CR12","doi-asserted-by":"publisher","first-page":"260","DOI":"10.1038\/483260a","volume":"483","author":"D Wolman","year":"2012","unstructured":"Wolman, D. A tale of two halves. Nature 483, 260\u2013263 (2012).","journal-title":"Nature"},{"key":"168_CR13","doi-asserted-by":"publisher","DOI":"10.1038\/srep10532","volume":"5","author":"I Diez","year":"2015","unstructured":"Diez, I. et al. A novel brain partition highlights the modular skeleton shared by structure and function. Sci. Rep. 5, 10532 (2015).","journal-title":"Sci. Rep."},{"key":"168_CR14","unstructured":"Simonyan, K. & Zisserman, A. Two-stream convolutional networks for action recognition in videos. In Proceedings of the 27th International Conference on Neural Information Processing Systems 568\u2013576 (ACM, 2014)."},{"key":"168_CR15","unstructured":"Lucas, B. D. Generalized Image Matching by the Method of Differences. PhD thesis, Carnegie Mellon Univ. (1986)."},{"key":"168_CR16","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S. & Schmidhuber, J. Long short-term memory. Neural Comput. 9, 1735\u20131780 (1997).","journal-title":"Neural Comput."},{"key":"168_CR17","unstructured":"Soomro, K., Zamir, A. R. & Shah, M. UCF101: a dataset of 101 human actions classes from videos in the wild. Preprint at https:\/\/arxiv.org\/abs\/1212.0402 (2012)."},{"key":"168_CR18","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T. & Serre, T. HMDB: a large video database for human motion recognition. In IEEE International Conference on Computer Vision 2556\u20132563 (IEEE, 2011).","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"168_CR19","doi-asserted-by":"crossref","unstructured":"Carreira, J. & Zisserman, A. Quo vadis, action recognition? A new model and the kinetics dataset. In IEEE Conference on Computer Vision and Pattern Recognition 4724\u20134733 (IEEE, 2017).","DOI":"10.1109\/CVPR.2017.502"},{"key":"168_CR20","doi-asserted-by":"crossref","unstructured":"Donahue, J. et al. Long-term recurrent convolutional networks for visual recognition and description. In IEEE Conference on Computer Vision and Pattern Recognition 2625\u20132634 (IEEE, 2015).","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"168_CR21","unstructured":"Shi, X. et al. Convolutional LSTM network: a machine learning approach for precipitation nowcasting. In Annual Conference on Neural Information Processing Systems 802\u2013810 (ACM, 2015)."},{"key":"168_CR22","unstructured":"Simonyan, K. & Zisserman, A. Very deep convolutional networks for large-scale image recognition. In International Conference on Learning Representations (IEEE, 2015)."},{"key":"168_CR23","doi-asserted-by":"crossref","unstructured":"Szegedy, C. et al. Going deeper with convolutions. In IEEE Conference on Computer Vision and Pattern Recognition 1\u20139 (IEEE, 2015).","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"168_CR24","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji, S., Xu, W., Yang, M. & Yu, K. 3D convolutional neural networks for human action recognition. IEEE Trans. Pattern Analysis Mach. Intel. 35, 221\u2013231 (2013).","journal-title":"IEEE Trans. Pattern Analysis Mach. Intel."},{"key":"168_CR25","doi-asserted-by":"crossref","unstructured":"Cordts, M. et al. The cityscapes dataset for semantic urban scene understanding. In IEEE Conference on Computer Vision and Pattern Recognition 3213\u20133223 (IEEE, 2016).","DOI":"10.1109\/CVPR.2016.350"},{"key":"168_CR26","doi-asserted-by":"crossref","unstructured":"Castrejon, L., Kundu, K., Urtasun, R. & Fidler, S. Annotating object instances with a Polygon-RNN. In IEEE Conference on Computer Vision and Pattern Recognition 2 (IEEE, 2017).","DOI":"10.1109\/CVPR.2017.477"},{"key":"168_CR27","unstructured":"Santana, E. & Hotz, G. Learning a driving simulator. Preprint at https:\/\/arxiv.org\/abs\/1608.01230 (2016)."},{"key":"168_CR28","doi-asserted-by":"crossref","unstructured":"Chen, Y. et al. Lidar-video driving dataset: learning driving policies effectively. In IEEE Conference on Computer Vision and Pattern Recognition 5870\u20135878 (IEEE, 2018).","DOI":"10.1109\/CVPR.2018.00615"},{"key":"168_CR29","doi-asserted-by":"crossref","unstructured":"Karpathy, A. et al. Large-scale video classification with convolutional neural networks. In IEEE Conference Computer Vision and Pattern Recognition 1725\u20131732 (IEEE, 2014).","DOI":"10.1109\/CVPR.2014.223"},{"key":"168_CR30","unstructured":"Yue-Hei, N. J. et al. Beyond short snippets: deep networks for video classification. In IEEE Conference on Computer Vision and Pattern Recognition 4694\u20134702 (IEEE, 2015)."},{"key":"168_CR31","doi-asserted-by":"crossref","unstructured":"Wang, L., Qiao, Y., Tang, X. & Van, G. L. Actionness estimation using hybrid fully convolutional networks. IEEE Conference on Computer Vision and Pattern Recognition 2708\u20132717 (IEEE, 2016).","DOI":"10.1109\/CVPR.2016.296"},{"key":"168_CR32","doi-asserted-by":"crossref","unstructured":"Weinzaepfel, P., Harchaoui, Z. & Schmid, C. Learning to track for spatio-temporal action localization. In IEEE International Conference on Computer Vision 3164\u20133172 (IEEE, 2015).","DOI":"10.1109\/ICCV.2015.362"},{"key":"168_CR33","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C. & Liu, C. Action recognition by dense trajectories. In IEEE Conference on Computer Vision and Pattern Recognition 443\u2013455 (IEEE, 2011)."},{"key":"168_CR34","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s11263-012-0594-8","volume":"103","author":"H Wang","year":"2013","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C. & Liu, C. Dense trajectories and motion boundary descriptors for action recognition. Int. J. Comp. Vision 103, 60\u201379 (2013).","journal-title":"Int. J. Comp. Vision"},{"key":"168_CR35","doi-asserted-by":"crossref","unstructured":"Maji, S., Bourdev, L. & Malik, J. Action recognition from a distributed representation of pose and appearance. In IEEE Conference on Computer Vision and Pattern Recognition 3177\u20133184 (IEEE, 2011).","DOI":"10.1109\/CVPR.2011.5995631"},{"key":"168_CR36","doi-asserted-by":"crossref","unstructured":"Wu, Z., Wang, X., Jiang, Y., Ye, H. & Xue, X. Modeling spatial-temporal clues in a hybrid deep learning framework for video classification. In ACM International Conference on Multimedia 461\u2013470 (ACM, 2015).","DOI":"10.1145\/2733373.2806222"},{"key":"168_CR37","unstructured":"Srivastava, N., Mansimov, E. & Salakhudinov, R. Unsupervised learning of video representations using LSTMs. In International Conference on Machine Learning 843\u2013852 (PMLR, 2015)."},{"key":"168_CR38","doi-asserted-by":"crossref","unstructured":"Wu, C. et al. Long-term feature banks for detailed video understanding. In IEEE Conference on Computer Vision and Pattern Recognition 284\u2013293 (IEEE, 2019).","DOI":"10.1109\/CVPR.2019.00037"},{"key":"168_CR39","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C. & Zisserman, A. Video action transformer network. In IEEE Conference on Computer Vision and Pattern Recognition 244\u2013253 (IEEE, 2019).","DOI":"10.1109\/CVPR.2019.00033"},{"key":"168_CR40","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A. & Zisserman, A. Convolutional two-stream network fusion for video action recognition. In IEEE Conference on Computer Vision and Pattern Recognition 1933\u20131941 (IEEE, 2016).","DOI":"10.1109\/CVPR.2016.213"},{"key":"168_CR41","first-page":"1","volume":"5","author":"DE Rumelhart","year":"1988","unstructured":"Rumelhart, D. E. et al. Learning representations by back-propagating errors. Cognitive Modeling 5, 1 (1988).","journal-title":"Cognitive Modeling"},{"key":"168_CR42","doi-asserted-by":"publisher","first-page":"223","DOI":"10.1137\/16M1080173","volume":"60","author":"L Bottou","year":"2018","unstructured":"Bottou, L., Curtis, F. E. & Nocedal, J. Optimization methods for large-scale machine learning. SIAM Rev. 60, 223\u2013311 (2018).","journal-title":"SIAM Rev."},{"key":"168_CR43","doi-asserted-by":"publisher","first-page":"490","DOI":"10.1162\/neco.1990.2.4.490","volume":"2","author":"RJ Williams","year":"1990","unstructured":"Williams, R. J. & Peng, J. An efficient gradient-based algorithm for on-line training of recurrent network trajectories. Neural Comput. 2, 490\u2013501 (1990).","journal-title":"Neural Comput."},{"key":"168_CR44","doi-asserted-by":"crossref","unstructured":"Gu, C. et al. AVA: a video dataset of spatio-temporally localized atomic visual actions. In IEEE Conference on Computer Vision and Pattern Recognition 6047\u20136056 (IEEE, 2018).","DOI":"10.1109\/CVPR.2018.00633"},{"key":"168_CR45","doi-asserted-by":"crossref","unstructured":"Hou, R., Chen, C. & Shah, M. Tube convolutional neural network (T-CNN) for action detection in videos. In IEEE International Conference Computer Vision 5822\u20135831 (IEEE, 2017).","DOI":"10.1109\/ICCV.2017.620"},{"key":"168_CR46","doi-asserted-by":"crossref","unstructured":"Pang, B., Zha, K., Cao, H., Shi, C. & Lu, C. Deep RNN framework for visual sequential applications. In IEEE Conference on Computer Vision and Pattern Recognition 423\u2013432 (IEEE, 2019).","DOI":"10.1109\/CVPR.2019.00051"},{"key":"168_CR47","doi-asserted-by":"crossref","unstructured":"Song, S., Lan, C., Xing, J., Zeng, W. & Liu, J. An end-to-end spatio-temporal attention model for human action recognition from skeleton data. In AAAI Conference on Artificial Intelligence 4263\u20134270 (AAAI, 2017).","DOI":"10.1609\/aaai.v31i1.11212"},{"key":"168_CR48","doi-asserted-by":"crossref","unstructured":"Acuna, D., Ling, H., Kar, A & Fidler, S. Efficient interactive annotation of segmentation datasets with Polygon-RNN++. In IEEE Conference on Computer Vision and Pattern Recognition 859\u2013868 (IEEE, 2018).","DOI":"10.1109\/CVPR.2018.00096"},{"key":"168_CR49","unstructured":"Ioffe, S. & Szegedy, C. Batch normalization: accelerating deep network training by reducing internal covariate shift. In International Conference on Machine Learning 448\u2013456 (PMLR, 2015)."},{"key":"168_CR50","unstructured":"Kingma, D. & Ba, J. Adam: a method for stochastic optimization. In International Conference on Learning Representations (IEEE, 2015)."},{"key":"168_CR51","doi-asserted-by":"publisher","first-page":"1550","DOI":"10.1109\/5.58337","volume":"78","author":"PJ Werbos","year":"1990","unstructured":"Werbos, P. J. et al. Backpropagation through time: what it does and how to do it. Proc. IEEE 78, 1550\u20131560 (1990).","journal-title":"Proc. IEEE"}],"container-title":["Nature Machine Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s42256-020-0168-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-020-0168-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-020-0168-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,7]],"date-time":"2022-12-07T03:40:47Z","timestamp":1670384447000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s42256-020-0168-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,4,27]]},"references-count":51,"journal-issue":{"issue":"5","published-online":{"date-parts":[[2020,5]]}},"alternative-id":["168"],"URL":"https:\/\/doi.org\/10.1038\/s42256-020-0168-3","relation":{},"ISSN":["2522-5839"],"issn-type":[{"type":"electronic","value":"2522-5839"}],"subject":[],"published":{"date-parts":[[2020,4,27]]},"assertion":[{"value":"18 September 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 March 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 April 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}