{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:26:54Z","timestamp":1740122814280,"version":"3.37.3"},"reference-count":34,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2017,1,25]],"date-time":"2017-01-25T00:00:00Z","timestamp":1485302400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Neural Process Lett"],"published-print":{"date-parts":[[2017,8]]},"DOI":"10.1007\/s11063-017-9591-9","type":"journal-article","created":{"date-parts":[[2017,1,25]],"date-time":"2017-01-25T04:56:05Z","timestamp":1485320165000},"page":"313-328","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Capturing Temporal Structures for Video Captioning by Spatio-temporal Contexts and Channel Attention Mechanism"],"prefix":"10.1007","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6833-8882","authenticated-orcid":false,"given":"Dashan","family":"Guo","sequence":"first","affiliation":[]},{"given":"Wei","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xiangzhong","family":"Fang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,1,25]]},"reference":[{"issue":"2","key":"9591_CR1","doi-asserted-by":"crossref","first-page":"363","DOI":"10.1007\/s11063-015-9412-y","volume":"43","author":"J Azorin-Lopez","year":"2016","unstructured":"Azorin-Lopez J, Saval-Calvo M, Fuster-Guillo A, Garcia-Rodriguez J (2016) A novel prediction method for early recognition of global human behaviour in image sequences. Neural Process Lett 43(2):363\u2013387","journal-title":"Neural Process Lett"},{"key":"9591_CR2","unstructured":"Ballas N, Yao L, Pal C, Courville A (2015) Delving deeper into convolutional networks for learning video representations. arXiv preprint arXiv:1511.06432"},{"key":"9591_CR3","unstructured":"Chen DL, Dolan WB (2011) Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th annual meeting of the association for computational linguistics: human language technologies, Vol 1. Association for Computational Linguistics, pp 190\u2013200"},{"key":"9591_CR4","unstructured":"Chen X, Fang H, Lin TY, Vedantam R, Gupta S, Dollar P, Zitnick CL (2015) Microsoft coco captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325"},{"key":"9591_CR5","doi-asserted-by":"crossref","unstructured":"Cho K, Van\u00a0Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using rnn encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"key":"9591_CR6","unstructured":"Chung J, Gulcehre C, Cho K, Bengio Y (2014) Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555"},{"key":"9591_CR7","doi-asserted-by":"crossref","unstructured":"Denkowski M, Lavie A (2014) Meteor universal: Language specific translation evaluation for any target language. In: Proceedings of the EACL 2014 workshop on statistical machine translation, vol\u00a06","DOI":"10.3115\/v1\/W14-3348"},{"key":"9591_CR8","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne\u00a0Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"9591_CR9","unstructured":"Fernando B, Gould S (2016) Learning end-to-end video classification with rank-pooling. In: Proceedings of the 33rd international conference on machine learning, vol\u00a048. JMLR: W&CP, New York"},{"issue":"8","key":"9591_CR10","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"issue":"12","key":"9591_CR11","doi-asserted-by":"crossref","first-page":"5659","DOI":"10.1109\/TIP.2015.2487860","volume":"24","author":"C Hong","year":"2015","unstructured":"Hong C, Yu J, Wan J, Tao D, Wang M (2015) Multimodal deep autoencoder for human pose recovery. IEEE Trans Image Process 24(12):5659\u20135670","journal-title":"IEEE Trans Image Process"},{"key":"9591_CR12","doi-asserted-by":"crossref","first-page":"132","DOI":"10.1016\/j.sigpro.2015.10.004","volume":"124","author":"C Hong","year":"2016","unstructured":"Hong C, Chen X, Wang X, Tang C (2016) Hypergraph regularized autoencoder for image-based 3d human pose recovery. Signal Process 124:132\u2013140","journal-title":"Signal Process"},{"key":"9591_CR13","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"9591_CR14","doi-asserted-by":"crossref","unstructured":"Oneata D, Verbeek J, Schmid C (2013) Action and event recognition with fisher vectors on a compact feature set. In: Proceedings of the IEEE international conference on computer vision, pp 1817\u20131824","DOI":"10.1109\/ICCV.2013.228"},{"key":"9591_CR15","unstructured":"Pan P, Xu Z, Yang Y, Wu F, Zhuang Y (2015) Hierarchical recurrent neural encoder for video representation with application to captioning. arXiv preprint arXiv:1511.03476"},{"key":"9591_CR16","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ (2002) Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting on association for computational linguistics, Association for Computational Linguistics, pp 311\u2013318"},{"key":"9591_CR17","doi-asserted-by":"crossref","unstructured":"Peng X, Zou C, Qiao Y, Peng Q (2014) Action recognition with stacked fisher vectors. In: European conference on computer vision, Springer, Berlin, pp 581\u2013595","DOI":"10.1007\/978-3-319-10602-1_38"},{"issue":"2","key":"9591_CR18","doi-asserted-by":"crossref","first-page":"327","DOI":"10.1007\/s11063-015-9436-3","volume":"43","author":"B Rekabdar","year":"2016","unstructured":"Rekabdar B, Nicolescu M, Nicolescu M, Saffar MT, Kelley R (2016) A scale and translation invariant approach for early classification of spatio-temporal patterns using spiking neural networks. Neural Process Lett 43(2):327\u2013343","journal-title":"Neural Process Lett"},{"key":"9591_CR19","unstructured":"Simonyan K, Zisserman A (2014a) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems, pp 568\u2013576"},{"key":"9591_CR20","unstructured":"Simonyan K, Zisserman A (2014b) Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556"},{"issue":"1","key":"9591_CR21","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton GE, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15(1):1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"9591_CR22","unstructured":"Srivastava N, Mansimov E, Salakhutdinov R (2015) Unsupervised learning of video representations using LSTMs. In: Proceedings of the 32nd international conference on machine learning, vol\u00a037. JMLR: W&CP, Lille"},{"key":"9591_CR23","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, Erhan D, Vanhoucke V, Rabinovich A (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"9591_CR24","unstructured":"Team TTD, Al-Rfou R, Alain G, Almahairi A, Angermueller C, Bahdanau D, Ballas N, Bastien F, Bayer J, Belikov A, et\u00a0al. (2016) Theano: a python framework for fast computation of mathematical expressions. arXiv preprint arXiv:1605.02688"},{"key":"9591_CR25","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2014) C3d: generic features for video analysis. CoRR, abs\/14120767 2:7"},{"key":"9591_CR26","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence\u00a0Zitnick C, Parikh D (2015) Cider: Consensus-based image description evaluation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"9591_CR27","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney R, Darrell T, Saenko K (2015) Sequence to sequence\u2014video to text. In: Proceedings of the IEEE international conference on computer vision, pp 4534\u20134542","DOI":"10.1109\/ICCV.2015.515"},{"key":"9591_CR28","unstructured":"Venugopalan S, Xu H, Donahue J, Rohrbach M, Mooney R, Saenko K (2014) Translating videos to natural language using deep recurrent neural networks. arXiv preprint arXiv:1412.4729"},{"key":"9591_CR29","doi-asserted-by":"crossref","unstructured":"Wang H, Schmid C (2013) Action recognition with improved trajectories. In: Proceedings of the IEEE international conference on computer vision, pp 3551\u20133558","DOI":"10.1109\/ICCV.2013.441"},{"key":"9591_CR30","unstructured":"Xingjian S, Chen Z, Wang H, Yeung DY, Wong WK, Woo WC (2015) Convolutional LSTM network: a machine learning approach for precipitation nowcasting. In: Advances in neural information processing systems, pp 802\u2013881"},{"key":"9591_CR31","doi-asserted-by":"crossref","unstructured":"Yao L, Torabi A, Cho K, Ballas N, Pal C, Larochelle H, Courville A (2015) Describing videos by exploiting temporal structure. In: Proceedings of the IEEE international conference on computer vision, pp 4507\u20134515","DOI":"10.1109\/ICCV.2015.512"},{"key":"9591_CR32","unstructured":"Yeung S, Russakovsky O, Jin N, Andriluka M, Mori G, Fei-Fei L (2015) Every moment counts: Dense detailed labeling of actions in complex videos. arXiv preprint arXiv:1507.05738"},{"key":"9591_CR33","doi-asserted-by":"publisher","unstructured":"Yu J, Yang X, Gao F, Tao D (2016) Deep multimodal distance metric learning using click constraints for image ranking. IEEE Trans Cybern. doi: 10.1109\/TCYB.2016.2591583","DOI":"10.1109\/TCYB.2016.2591583"},{"key":"9591_CR34","unstructured":"Zeiler MD (2012) Adadelta: An adaptive learning rate method. arXiv preprint arXiv:1212.5701"}],"container-title":["Neural Processing Letters"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11063-017-9591-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-017-9591-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11063-017-9591-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,17]],"date-time":"2019-09-17T19:42:42Z","timestamp":1568749362000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11063-017-9591-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,1,25]]},"references-count":34,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2017,8]]}},"alternative-id":["9591"],"URL":"https:\/\/doi.org\/10.1007\/s11063-017-9591-9","relation":{},"ISSN":["1370-4621","1573-773X"],"issn-type":[{"type":"print","value":"1370-4621"},{"type":"electronic","value":"1573-773X"}],"subject":[],"published":{"date-parts":[[2017,1,25]]}}}