{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,15]],"date-time":"2025-12-15T14:05:05Z","timestamp":1765807505384},"reference-count":51,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2017,12,18]],"date-time":"2017-12-18T00:00:00Z","timestamp":1513555200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61772359,61472275"],"award-info":[{"award-number":["61772359,61472275"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Tianjin Research Program of Application Foundation and Advanced Technology","award":["15JCYBJC16200"],"award-info":[{"award-number":["15JCYBJC16200"]}]},{"name":"National Research Foundation, Prime Minister Office, Singapore under its International Research Centre in Singapore Funding Initiative"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2019,1]]},"DOI":"10.1007\/s11042-017-5532-x","type":"journal-article","created":{"date-parts":[[2017,12,18]],"date-time":"2017-12-18T01:01:01Z","timestamp":1513558861000},"page":"677-695","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":32,"title":["LSTM-based multi-label video event detection"],"prefix":"10.1007","volume":"78","author":[{"given":"An-An","family":"Liu","sequence":"first","affiliation":[]},{"given":"Zhuang","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Yongkang","family":"Wong","sequence":"additional","affiliation":[]},{"given":"Junnan","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yu-Ting","family":"Su","sequence":"additional","affiliation":[]},{"given":"Mohan","family":"Kankanhalli","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,12,18]]},"reference":[{"key":"5532_CR1","first-page":"0473","volume":"1409","author":"D Bahdanau","year":"2014","unstructured":"Bahdanau D, Cho K, Bengio Y (2014) Neural machine translation by jointly learning to align and translate. CoRR 1409:0473","journal-title":"CoRR"},{"key":"5532_CR2","doi-asserted-by":"crossref","unstructured":"Benfold B, Reid ID (2011) Stable multi-target tracking in real-time surveillance video. In: IEEE Conference on computer vision and pattern recognition, pp 3457\u20133464","DOI":"10.1109\/CVPR.2011.5995667"},{"issue":"3","key":"5532_CR3","first-page":"27:1","volume":"2","author":"C Chang","year":"2011","unstructured":"Chang C, Lin C (2011) LIBSVM: A library for support vector machines. ACM TIST 2(3):27:1\u201327:27","journal-title":"ACM TIST"},{"key":"5532_CR4","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1016\/j.sigpro.2015.10.037","volume":"124","author":"Z Cheng","year":"2016","unstructured":"Cheng Z, Shen J (2016) On very large scale test collection for landmark image search benchmarking. Signal Process 124:13\u201326","journal-title":"Signal Process"},{"key":"5532_CR5","doi-asserted-by":"crossref","unstructured":"Cho K, van Merrienboer B, Bahdanau D, Bengio Y (2014) On the properties of neural machine translation: Encoder-decoder approaches. In: Proceedings of eighth workshop on syntax, semantics and structure in statistical translation, pp 103\u2013111","DOI":"10.3115\/v1\/W14-4012"},{"key":"5532_CR6","doi-asserted-by":"crossref","unstructured":"Cho K, van Merrienboer B, G\u00fcl\u00e7ehre \u00c7, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: Proceedings of the 2014 conference on empirical methods in natural language processing, pp 1724\u20131734","DOI":"10.3115\/v1\/D14-1179"},{"key":"5532_CR7","doi-asserted-by":"crossref","unstructured":"Chu W, Song Y, Jaimes A (2015) Video co-summarization: Video summarization by visual co-occurrence. In: IEEE Conference on computer vision and pattern recognition, pp 3584\u20133592","DOI":"10.1109\/CVPR.2015.7298981"},{"issue":"8","key":"5532_CR8","doi-asserted-by":"publisher","first-page":"745","DOI":"10.1109\/TPAMI.2000.868676","volume":"22","author":"RT Collins","year":"2000","unstructured":"Collins RT, Biernacki C, Celeux G, Lipton AJ, Govaert G, Kanade T (2000) Introduction to the special section on video surveillance. IEEE Trans Pattern Anal Mach Intell 22(8):745\u2013746","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5532_CR9","unstructured":"Dai J, Li Y, He K, Sun J (2016) R-fcn: Object detection via region-based fully convolutional networks. In: Advances in neural information processing systems, pp 379\u2013387"},{"key":"5532_CR10","doi-asserted-by":"crossref","unstructured":"Dalal N, Triggs B (2005) Histograms of oriented gradients for human detection. In: IEEE Conference on computer vision and pattern recognition, pp 886\u2013893","DOI":"10.1109\/CVPR.2005.177"},{"key":"5532_CR11","doi-asserted-by":"publisher","first-page":"428","DOI":"10.1007\/11744047_33","volume":"3952","author":"N Dalal","year":"2006","unstructured":"Dalal N, Triggs B, Schmid C (2006) Human detection using oriented histograms of flow and appearance. Lect Notes Comput Sci 3952:428\u2013441","journal-title":"Lect Notes Comput Sci"},{"issue":"4","key":"5532_CR12","doi-asserted-by":"publisher","first-page":"677","DOI":"10.1109\/TPAMI.2016.2599174","volume":"39","author":"J Donahue","year":"2017","unstructured":"Donahue J, Hendricks LA, Rohrbach M, Venugopalan S, Guadarrama S, Saenko K, Darrell T (2017) Long-term recurrent convolutional networks for visual recognition and description. IEEE Trans Pattern Anal Mach Intell 39(4):677\u2013691","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5532_CR13","doi-asserted-by":"crossref","unstructured":"Fan C, Crandall DJ (2016) Deepdiary: Automatically captioning lifelogging image streams. In: European conference on computer vision workshops, pp 459\u2013473","DOI":"10.1007\/978-3-319-46604-0_33"},{"issue":"1","key":"5532_CR14","first-page":"113","volume":"87-D","author":"H Fujiyoshi","year":"2004","unstructured":"Fujiyoshi H, Lipton AJ, Kanade T (2004) Real-time human motion analysis by image skeletonization. IEICE, Transactions 87-D(1):113\u2013120","journal-title":"IEICE, Transactions"},{"key":"5532_CR15","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1016\/j.sigpro.2014.08.034","volume":"112","author":"Z Gao","year":"2015","unstructured":"Gao Z, Zhang H, Xu GP, Xue YB, Hauptmann AG (2015) Multi-view discriminative and structured dictionary learning with group sparsity for human action recognition. Signal Process 112:83\u201397","journal-title":"Signal Process"},{"key":"5532_CR16","unstructured":"Gong Y, Jia Y, Leung T, Toshev A, Ioffe S (2013) Deep convolutional ranking for multilabel image annotation. arXiv: 1312.4894"},{"key":"5532_CR17","doi-asserted-by":"crossref","unstructured":"Guo J, Ren T, Bei J, Zhu Y (2015) Salient object detection in RGB-d image based on saliency fusion and propagation. In: International conference on internet multimedia computing and service, pp 59:1\u201359:5","DOI":"10.1145\/2808492.2808551"},{"key":"5532_CR18","doi-asserted-by":"crossref","unstructured":"Gutchess D, Trajkovic M, Cohen-Solal E, Lyons DM, Jain AK (2001) A background model initialization algorithm for video surveillance. In: ICCV, pp 733\u2013740","DOI":"10.1109\/ICCV.2001.937598"},{"issue":"1","key":"5532_CR19","doi-asserted-by":"publisher","first-page":"57","DOI":"10.1109\/TKDE.2016.2611584","volume":"29","author":"X He","year":"2017","unstructured":"He X, Gao M, Kan M, Wang D (2017) Birank: Towards ranking on bipartite graphs. IEEE Trans Knowl Data Eng 29(1):57\u201371","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"5532_CR20","unstructured":"Hochreiter S, Schmidhuber J (1996) LSTM can solve hard long time lag problems. In: Advances in neural information processing systems, pp 473\u2013479"},{"key":"5532_CR21","doi-asserted-by":"crossref","unstructured":"Ibrahim MS, Muralidharan S, Deng Z, Vahdat A, Mori G (2016) A hierarchical deep temporal model for group activity recognition. In: IEEE Conference on computer vision and pattern recognition, pp 1971\u20131980","DOI":"10.1109\/CVPR.2016.217"},{"issue":"1","key":"5532_CR22","doi-asserted-by":"publisher","first-page":"221","DOI":"10.1109\/TPAMI.2012.59","volume":"35","author":"S Ji","year":"2013","unstructured":"Ji S, Xu W, Yang M, Yu K (2013) 3d convolutional neural networks for human action recognition. IEEE Trans Pattern Anal Mach Intell 35(1):221\u2013231","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5532_CR23","doi-asserted-by":"crossref","unstructured":"Johnson J, Karpathy A, Fei-Fei L (2016) Densecap: Fully convolutional localization networks for dense captioning. In: IEEE Conference on computer vision and pattern recognition, pp 4565\u20134574","DOI":"10.1109\/CVPR.2016.494"},{"key":"5532_CR24","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Li F (2014) Large-scale video classification with convolutional neural networks. In: IEEE Conference on computer vision and pattern recognition, pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"5532_CR25","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2012) Imagenet classification with deep convolutional neural networks. In: Advances in neural information processing systems, pp 1097\u20131105"},{"issue":"8","key":"5532_CR26","doi-asserted-by":"publisher","first-page":"1549","DOI":"10.1109\/TPAMI.2011.228","volume":"34","author":"T Lan","year":"2012","unstructured":"Lan T, Wang Y, Yang W, Robinovitch SN, Mori G (2012) Discriminative latent models for recognizing contextual group activities. IEEE Trans Pattern Anal Mach Intell 34(8):1549\u20131562","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5532_CR27","doi-asserted-by":"crossref","unstructured":"Lazebnik S, Schmid C, Ponce J (2006) Beyond bags of features: Spatial pyramid matching for recognizing natural scene categories. In: IEEE Conference on computer vision and pattern recognition, pp 2169\u20132178","DOI":"10.1109\/CVPR.2006.68"},{"issue":"5","key":"5532_CR28","doi-asserted-by":"publisher","first-page":"827","DOI":"10.1109\/TPAMI.2005.102","volume":"27","author":"D Lee","year":"2005","unstructured":"Lee D (2005) Effective gaussian mixture learning for video background subtraction. IEEE Trans Pattern Anal Mach Intell 27(5):827\u2013832","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5532_CR29","doi-asserted-by":"crossref","unstructured":"Li J, Wong Y, Kankanhalli MS (2016) Multi-stream deep learning framework for automated presentation assessment. In: IEEE International symposium on multimedia, pp 222\u2013225","DOI":"10.1109\/ISM.2016.0051"},{"issue":"6","key":"5532_CR30","doi-asserted-by":"publisher","first-page":"1194","DOI":"10.1109\/TCYB.2014.2347057","volume":"45","author":"A Liu","year":"2015","unstructured":"Liu A, Su Y, Jia P, Gao Z, Hao T, Yang Z (2015) Multipe\/single-view human action recognition via part-induced multitask structural learning. IEEE Trans Cybernetics 45(6):1194\u20131208","journal-title":"IEEE Trans Cybernetics"},{"issue":"1","key":"5532_CR31","doi-asserted-by":"publisher","first-page":"102","DOI":"10.1109\/TPAMI.2016.2537337","volume":"39","author":"A Liu","year":"2017","unstructured":"Liu A, Su Y, Nie W, Kankanhalli MS (2017) Hierarchical clustering multi-task learning for joint human action grouping and recognition. IEEE Trans Pattern Anal Mach Intell 39(1):102\u2013114","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5532_CR32","doi-asserted-by":"crossref","unstructured":"Liu A, Xu N, Nie W, Su Y, Wong Y, Kankanhalli M (2017) Benchmarking a multi-modal & multi-view & interactive dataset for human action recognition. IEEE Trans Cybern","DOI":"10.1109\/TCYB.2016.2582918"},{"key":"5532_CR33","doi-asserted-by":"publisher","unstructured":"Ma S, Bargal SA, Zhang J, Sigal L, Sclaroff S (2017) Do less and achieve more: Training CNNs for action recognition utilizing action images from the web. Pattern Recognition. https:\/\/doi.org\/10.1016\/j.patcog.2017.01.027","DOI":"10.1016\/j.patcog.2017.01.027"},{"issue":"2","key":"5532_CR34","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1016\/j.jvcir.2007.04.002","volume":"19","author":"AG Money","year":"2008","unstructured":"Money AG, Agius HW (2008) Video summarisation: A conceptual framework and survey of the state of the art. J Vis Commun Image Represent 19(2):121\u2013143","journal-title":"J Vis Commun Image Represent"},{"issue":"2","key":"5532_CR35","doi-asserted-by":"publisher","first-page":"13:1","DOI":"10.1145\/2180868.2180875","volume":"30","author":"L Nie","year":"2012","unstructured":"Nie L, Wang M, Zha Z, Chua T (2012) Oracle in image search: A content-based approach to performance prediction. ACM Trans Inf Syst 30(2):13:1\u201313:23","journal-title":"ACM Trans Inf Syst"},{"key":"5532_CR36","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/j.cviu.2016.03.013","volume":"150","author":"X Peng","year":"2016","unstructured":"Peng X, Wang L, Wang X, Qiao Y (2016) Bag of visual words and fusion methods for action recognition: Comprehensive study and good practice. Comput Vis Image Underst 150:109\u2013125","journal-title":"Comput Vis Image Underst"},{"issue":"11","key":"5532_CR37","doi-asserted-by":"publisher","first-page":"1369","DOI":"10.1016\/j.patrec.2010.03.024","volume":"31","author":"J Pers","year":"2010","unstructured":"Pers J, Sulic V, Kristan M, Perse M, Polanec K, Kovacic S (2010) Histograms of optical flow for efficient representation of body motion. Pattern Recogn Lett 31(11):1369\u20131376","journal-title":"Pattern Recogn Lett"},{"key":"5532_CR38","doi-asserted-by":"crossref","unstructured":"Pritch Y, Ratovitch S, Hendel A, Peleg S (2009) Clustered synopsis of surveillance video. In: IEEE International conference on advanced video and signal based surveillance, pp 195\u2013200","DOI":"10.1109\/AVSS.2009.53"},{"issue":"12","key":"5532_CR39","doi-asserted-by":"publisher","first-page":"2263","DOI":"10.1109\/TASLP.2016.2602884","volume":"24","author":"Y Qian","year":"2016","unstructured":"Qian Y, Bi M, Tan T, Yu K (2016) Very deep convolutional neural networks for noise robust speech recognition. IEEE\/ACM Trans Audio, Speech & Language Processing 24(12):2263\u20132276","journal-title":"IEEE\/ACM Trans Audio, Speech & Language Processing"},{"key":"5532_CR40","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems, pp 568\u2013576"},{"key":"5532_CR41","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems, pp 568\u2013576"},{"key":"5532_CR42","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition"},{"key":"5532_CR43","unstructured":"Sutskever I, Vinyals O, Le QV (2014) Sequence to sequence learning with neural networks. In: Advances in neural information processing systems"},{"issue":"1","key":"5532_CR44","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1145\/1198302.1198305","volume":"3","author":"BT Truong","year":"2007","unstructured":"Truong BT, Venkatesh S (2007) Video abstraction: A systematic review and classification. TOMCCAP 3(1):3","journal-title":"TOMCCAP"},{"issue":"2","key":"5532_CR45","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1109\/MMUL.2014.29","volume":"21","author":"K Tu","year":"2014","unstructured":"Tu K, Meng M, Lee MW, Choe TE, Zhu SC (2014) Joint video and text parsing for understanding events and answering queries. IEEE Multimedia 21(2):42\u201370","journal-title":"IEEE Multimedia"},{"key":"5532_CR46","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Rohrbach M, Donahue J, Mooney RJ, Darrell T, Saenko K (2015) Sequence to sequence - video to text. In: IEEE International conference on computer vision, pp 4534\u20134542","DOI":"10.1109\/ICCV.2015.515"},{"key":"5532_CR47","doi-asserted-by":"crossref","unstructured":"Venugopalan S, Hendricks LA, Mooney RJ, Saenko K (2016) Improving lstm-based video description with linguistic knowledge mined from text. In: Proceedings of the 2016 conference on empirical methods in natural language processing, pp 1961\u20131966","DOI":"10.18653\/v1\/D16-1204"},{"issue":"3","key":"5532_CR48","doi-asserted-by":"publisher","first-page":"585","DOI":"10.1016\/S0031-3203(02)00100-0","volume":"36","author":"L Wang","year":"2003","unstructured":"Wang L, Hu W, Tan T (2003) Recent developments in human motion analysis. Pattern Recogn 36(3):585\u2013601","journal-title":"Pattern Recogn"},{"key":"5532_CR49","unstructured":"Yang Z, Yuan Y, Wu Y, Salakhutdinov R, Cohen WW (2016) Encode, review and decode: Reviewer module for caption generation"},{"key":"5532_CR50","unstructured":"Yeung S, Fathi A, Fei-Fei L (2014) Videoset: video summary evaluation through text. In: CVPR Egocentric vision workshop"},{"issue":"1","key":"5532_CR51","doi-asserted-by":"publisher","first-page":"1:1","DOI":"10.1145\/2978656","volume":"13","author":"H Zhang","year":"2016","unstructured":"Zhang H, Shang X, Luan H, Wang M, Chua T (2016) Learning from collective intelligence: Feature learning using social images and tags. ACM Trans Multimed Comput Commun Appl 13(1):1:1\u20131:23","journal-title":"ACM Trans Multimed Comput Commun Appl"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-017-5532-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-017-5532-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-017-5532-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,10,8]],"date-time":"2019-10-08T01:07:00Z","timestamp":1570496820000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-017-5532-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,12,18]]},"references-count":51,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2019,1]]}},"alternative-id":["5532"],"URL":"https:\/\/doi.org\/10.1007\/s11042-017-5532-x","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,12,18]]},"assertion":[{"value":"24 May 2017","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 October 2017","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 December 2017","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 December 2017","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}