{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:28:58Z","timestamp":1740122938498,"version":"3.37.3"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"21-23","license":[{"start":{"date-parts":[[2021,7,20]],"date-time":"2021-07-20T00:00:00Z","timestamp":1626739200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,7,20]],"date-time":"2021-07-20T00:00:00Z","timestamp":1626739200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61771068"],"award-info":[{"award-number":["61771068"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,9]]},"DOI":"10.1007\/s11042-021-11093-7","type":"journal-article","created":{"date-parts":[[2021,7,20]],"date-time":"2021-07-20T00:02:48Z","timestamp":1626739368000},"page":"31821-31836","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Exploiting local spatio-temporal characteristics for effective video understanding"],"prefix":"10.1007","volume":"80","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0520-7807","authenticated-orcid":false,"given":"Tongcun","family":"Liu","sequence":"first","affiliation":[]},{"given":"Haoxin","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Yulong","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,7,20]]},"reference":[{"key":"11093_CR1","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Puerto Rico, USA, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"11093_CR2","doi-asserted-by":"publisher","first-page":"30615","DOI":"10.1007\/s11042-020-09539-5","volume":"79","author":"T Do Carmo Nogueira","year":"2020","unstructured":"Do Carmo Nogueira T, Vinhal CDN, da Cruz J\u00fanior G, Ullmann MRD (2020) Reference-based model using multimodal gated recurrent units for image captioning. Multimed Tools Appl 79:30615\u201330635. https:\/\/doi.org\/10.1007\/s11042-020-09539-5","journal-title":"Multimed Tools Appl"},{"key":"11093_CR3","doi-asserted-by":"crossref","unstructured":"Donahue J, Hendricks LA, Guadarrama S et al (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Boston, Massachusetts, USA, pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"11093_CR4","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Las Vegas, NV, USA, pp 1933\u20131941","DOI":"10.1109\/CVPR.2016.213"},{"key":"11093_CR5","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Wildes RP (2017) Spatiotemporal multiplier networks for video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Puerto Rico, USA, pp 4768\u20134777","DOI":"10.1109\/CVPR.2017.787"},{"key":"11093_CR6","doi-asserted-by":"crossref","unstructured":"Gan C, Naiyan Wang, Yang Y et al (2015) DevNet: a deep event network for multimedia event detection and evidence recounting. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Boston, MA, USA, pp 2568\u20132577","DOI":"10.1109\/CVPR.2015.7298872"},{"key":"11093_CR7","doi-asserted-by":"publisher","unstructured":"Gao L, Li X, Song J, Shen HT (2019) Hierarchical LSTMs with Adaptive Attention for Visual Captioning. IEEE Trans Pattern Anal Mach Intell 1\u20131. https:\/\/doi.org\/10.1109\/TPAMI.2019.2894139","DOI":"10.1109\/TPAMI.2019.2894139"},{"key":"11093_CR8","doi-asserted-by":"crossref","unstructured":"Girdhar R, Ramanan D, Gupta A et al (2017) ActionVLAD: learning spatio-temporal aggregation for action classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Honolulu, HI, pp 3165\u20133174","DOI":"10.1109\/CVPR.2017.337"},{"key":"11093_CR9","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Las Vegas, NV, USA, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"11093_CR10","doi-asserted-by":"crossref","unstructured":"Heilbron FC, Escorcia V, Ghanem B, Niebles JC (2015) ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Boston, Massachusetts, USA, pp 961\u2013970","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"11093_CR11","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"sepp Hochreiter","year":"1997","unstructured":"Hochreiter sepp, Schmidhuber J (1997) Long short-term memory. Neural Comput 9:1735\u20131780. https:\/\/doi.org\/10.1162\/neco.1997.9.8.1735","journal-title":"Neural Comput"},{"key":"11093_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.cviu.2016.10.018","volume":"155","author":"H Idrees","year":"2017","unstructured":"Idrees H, Zamir AR, Jiang Y-G et al (2017) The THUMOS challenge on action recognition for videos \u201cin the Wild. Comput Vis Image Underst 155:1\u201323. https:\/\/doi.org\/10.1016\/j.cviu.2016.10.018","journal-title":"Comput Vis Image Underst"},{"key":"11093_CR13","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S et al (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the Conference on Computer Vision and Pattern Recognition. Columbus, OH, USA, pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"11093_CR14","unstructured":"Khurram S, Amir Roshan Z, Mubarak S (2012) UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv:14091556 [cs]"},{"key":"11093_CR15","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E et al (2011) HMDB: a large video database for human motion recognition. In: Proceedings of the International Conference on Computer Vision. Barcelona, Spain, pp 2556\u20132563","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"11093_CR16","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1016\/j.cviu.2017.10.011","volume":"166","author":"Z Li","year":"2018","unstructured":"Li Z, Gavrilyuk K, Gavves E et al (2018) VideoLSTM convolves, attends and flows for action recognition. Comput Vis Image Underst 166:41\u201350","journal-title":"Comput Vis Image Underst"},{"key":"11093_CR17","doi-asserted-by":"crossref","unstructured":"Lin J, Gan C, Han S (2019) TSM: temporal shift module for efficient video understanding. In: Proceedings of the IEEE International Conference on Computer Vision. Seoul, Korea, pp 7083\u20137093","DOI":"10.1109\/ICCV.2019.00718"},{"key":"11093_CR18","unstructured":"Ng JYue-Hei, Hausknecht M, Vijayanarasimhan S et al (2015) Beyond short snippets: deep networks for video classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Boston, MA, USA, pp 4694\u20134702"},{"key":"11093_CR19","doi-asserted-by":"publisher","first-page":"2263","DOI":"10.1007\/s11042-019-08113-y","volume":"79","author":"S Priyanka","year":"2020","unstructured":"Priyanka S (2020) Microstructure pattern extraction based image retrieval. Multimed Tools Appl 79:2263\u20132283. https:\/\/doi.org\/10.1007\/s11042-019-08113-y","journal-title":"Multimed Tools Appl"},{"key":"11093_CR20","doi-asserted-by":"crossref","unstructured":"Qiu Z, Yao T, Mei T (2017) Learning spatio-temporal representation with pseudo-3d residual networks. In: Proceedings of the IEEE International Conference on Computer Vision. Venice, Italy, pp 5534\u20135542","DOI":"10.1109\/ICCV.2017.590"},{"key":"11093_CR21","doi-asserted-by":"publisher","first-page":"1587","DOI":"10.1109\/TCSVT.2008.2005607","volume":"18","author":"J Shen","year":"2008","unstructured":"Shen J, Tao D, Li X (2008) Modality mixture projections for semantic video event detection. IEEE Trans Circuits Syst Video Technol 18:1587\u20131596. https:\/\/doi.org\/10.1109\/TCSVT.2008.2005607","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"11093_CR22","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems. Curran Associates, Inc"},{"key":"11093_CR23","doi-asserted-by":"publisher","first-page":"3047","DOI":"10.1109\/TNNLS.2018.2851077","volume":"30","author":"J Song","year":"2019","unstructured":"Song J, Guo Y, Gao L et al (2019) From deterministic to generative: multimodal stochastic RNNs for video captioning. IEEE Trans Neural Netw Learn Syst 30:3047\u20133058. https:\/\/doi.org\/10.1109\/TNNLS.2018.2851077","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"11093_CR24","unstructured":"Srivastava N, Mansimov E, Salakhutdinov R (2015) Unsupervised learning of video representations using LSTMs. In: Proceedings of the 32nd International Conference on Machine Learning. Lille, France, pp 843\u2013852"},{"key":"11093_CR25","doi-asserted-by":"crossref","unstructured":"Sun Y, Wang X, Tang X (2015) Deeply learned face representations are sparse, selective, and robust. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Boston, MA, USA, pp 2892\u20132900","DOI":"10.1109\/CVPR.2015.7298907"},{"key":"11093_CR26","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R et al (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision. Santiago, Chile, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"11093_CR27","unstructured":"Tran D, Ray J, Shou Z et al (2017) ConvNet architecture search for spatiotemporal feature learning. arXiv:170805038 [cs]"},{"key":"11093_CR28","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TPAMI.2017.2712608","volume":"40","author":"G Varol","year":"2017","unstructured":"Varol G, Laptev I, Schmid C (2017) Long-term temporal convolutions for action recognition. IEEE Trans Pattern Anal Mach Intell 40:1510\u20131517","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"11093_CR29","doi-asserted-by":"crossref","unstructured":"Wang L, Xiong Y, Wang Z et al (2016) Temporal segment networks: towards good practices for deep action recognition. In: Proceedings of the 14th European Conference on Computer Vision. Amsterdam, pp 1\u201315","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"11093_CR30","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1109\/LSP.2016.2611485","volume":"24","author":"X Wang","year":"2017","unstructured":"Wang X, Gao L, Song J, Shen H (2017) Beyond frame-level CNN: saliency-aware 3-D CNN with LSTM for video action recognition. IEEE Signal Process Lett 24:510\u2013514. https:\/\/doi.org\/10.1109\/LSP.2016.2611485","journal-title":"IEEE Signal Process Lett"},{"key":"11093_CR31","doi-asserted-by":"publisher","first-page":"634","DOI":"10.1109\/TMM.2017.2749159","volume":"20","author":"X Wang","year":"2018","unstructured":"Wang X, Gao L, Wang P et al (2018) Two-stream 3-D convNet fusion for action recognition in videos with arbitrary size and length. IEEE Trans Multimedia 20:634\u2013644. https:\/\/doi.org\/10.1109\/TMM.2017.2749159","journal-title":"IEEE Trans Multimedia"},{"key":"11093_CR32","doi-asserted-by":"crossref","unstructured":"Wang X, Girshick R, Gupta A, He K (2018) Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. Salt Lake City, UT, USA, pp 7794\u20137803","DOI":"10.1109\/CVPR.2018.00813"},{"key":"11093_CR33","doi-asserted-by":"publisher","first-page":"3330","DOI":"10.1109\/TCYB.2019.2894498","volume":"50","author":"L Wang","year":"2020","unstructured":"Wang L, Qian X, Zhang Y et al (2020) Enhancing sketch-based image retrieval by CNN semantic re-ranking. IEEE Trans Cybern 50:3330\u20133342. https:\/\/doi.org\/10.1109\/TCYB.2019.2894498","journal-title":"IEEE Trans Cybern"},{"key":"11093_CR34","doi-asserted-by":"crossref","unstructured":"Yang C, Xu Y, Shi J et al (2020) Temporal Pyramid Network for Action Recognition. In: 2020 IEEE\/CVF Conference on Computer Vision, Recognition P (CVPR). IEEE, Seattle, WA, USA, pp 588\u2013597","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"11093_CR35","doi-asserted-by":"crossref","unstructured":"Zhou B, Andonian A, Oliva A, Torralba A (2018) Temporal Relational Reasoning in Videos. In: Ferrari V, Hebert M, Sminchisescu C, Weiss Y (eds) Proceedings of the European Conference on Computer Vision. Munich, Germany, pp 831\u2013846","DOI":"10.1007\/978-3-030-01246-5_49"},{"key":"11093_CR36","doi-asserted-by":"crossref","unstructured":"Zolfaghari M, Singh K, Brox T (2018) ECO: efficient convolutional network for online video understanding. In: Ferrari V, Hebert M, Sminchisescu C, Weiss Y (eds) Proceedings of the European Conference on Computer Vision. Munich, Germany, pp 713\u2013730","DOI":"10.1007\/978-3-030-01216-8_43"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11093-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-021-11093-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-11093-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,10,10]],"date-time":"2021-10-10T04:33:19Z","timestamp":1633840399000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-021-11093-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,20]]},"references-count":36,"journal-issue":{"issue":"21-23","published-print":{"date-parts":[[2021,9]]}},"alternative-id":["11093"],"URL":"https:\/\/doi.org\/10.1007\/s11042-021-11093-7","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2021,7,20]]},"assertion":[{"value":"15 September 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 January 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 May 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 July 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}