{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:27:15Z","timestamp":1740122835532,"version":"3.37.3"},"reference-count":44,"publisher":"Springer Science and Business Media LLC","issue":"28-29","license":[{"start":{"date-parts":[[2021,3,6]],"date-time":"2021-03-06T00:00:00Z","timestamp":1614988800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,3,6]],"date-time":"2021-03-06T00:00:00Z","timestamp":1614988800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572162"],"award-info":[{"award-number":["61572162"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572251"],"award-info":[{"award-number":["61572251"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802095"],"award-info":[{"award-number":["61802095"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61702144"],"award-info":[{"award-number":["61702144"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhejiang Provincial Key Science and Technology Project Foundation","award":["2018C01012"],"award-info":[{"award-number":["2018C01012"]}]},{"DOI":"10.13039\/501100004731","name":"Natural Science Foundation of Zhejiang Province","doi-asserted-by":"publisher","award":["LQ17F020003"],"award-info":[{"award-number":["LQ17F020003"]}],"id":[{"id":"10.13039\/501100004731","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2021,11]]},"DOI":"10.1007\/s11042-021-10633-5","type":"journal-article","created":{"date-parts":[[2021,3,6]],"date-time":"2021-03-06T08:02:31Z","timestamp":1615017751000},"page":"34973-34995","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Attention-based encoder-decoder networks for workflow recognition"],"prefix":"10.1007","volume":"80","author":[{"given":"Min","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6070-8524","authenticated-orcid":false,"given":"Haiyang","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Zhongjin","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Chen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,3,6]]},"reference":[{"unstructured":"Bahdanau D, Cho K, Bengio Y (2014) Neural machine translation by jointly learning to align and translate. arXiv:1409.0473","key":"10633_CR1"},{"doi-asserted-by":"crossref","unstructured":"Blum T, Feu\u00dfner H, Navab N (2010) Modeling and segmentation of surgical workflow from laparoscopic video. In: International conference on medical image computing and computer-assisted intervention, pp 400\u2013407","key":"10633_CR2","DOI":"10.1007\/978-3-642-15711-0_50"},{"doi-asserted-by":"crossref","unstructured":"Chao YW, Vijayanarasimhan S, Seybold B, Ross DA, Deng J, Sukthankar R (2018) Rethinking the faster r-cnn architecture for temporal action localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1130\u20131139","key":"10633_CR3","DOI":"10.1109\/CVPR.2018.00124"},{"issue":"1","key":"10633_CR4","doi-asserted-by":"publisher","first-page":"76","DOI":"10.1186\/s13640-018-0316-4","volume":"2018","author":"Y Chen","year":"2018","unstructured":"Chen Y, Sun Q L, Zhong K (2018) Semi-supervised spatio-temporal CNN for recognition of surgical workflow. EURASIP Journal on Image and Video Processing 2018(1):76","journal-title":"EURASIP Journal on Image and Video Processing"},{"doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: IEEE Conference on computer vision and pattern recognition, pp 248\u2013255","key":"10633_CR5","DOI":"10.1109\/CVPR.2009.5206848"},{"doi-asserted-by":"crossref","unstructured":"Dogan E, Eren G, Wolf C, Baskurt A (2015) Activity recognition with volume motion templates and histograms of 3d gradients. In: 2015 IEEE International Conference on Image Processing (ICIP), pp 4421\u20134425","key":"10633_CR6","DOI":"10.1109\/ICIP.2015.7351642"},{"doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Pinz A, Zisserman A (2016) Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1933\u20131941","key":"10633_CR7","DOI":"10.1109\/CVPR.2016.213"},{"unstructured":"Gorban A, Idrees H, Jiang Y G, Zamir A R, Laptev I, Shah M (2015) THUMOS challenge: Action recognition with a large number of classes","key":"10633_CR8"},{"key":"10633_CR9","doi-asserted-by":"publisher","first-page":"267","DOI":"10.1016\/j.patrec.2018.10.011","volume":"130","author":"H Hu","year":"2018","unstructured":"Hu H, Cheng K, Li Z, Chen J, Hu H (2018) Workflow recognition with structured two-stream convolutional networks. Pattern Recogn Lett 130:267\u2013274","journal-title":"Pattern Recogn Lett"},{"doi-asserted-by":"crossref","unstructured":"Jiang B, Wang M, Gan W, Wu W, Yan J (2019) STM: SpatioTemporal and motion encoding for action recognition. In: Proceedings of the IEEE international conference on computer vision, pp 2000\u20132009","key":"10633_CR10","DOI":"10.1109\/ICCV.2019.00209"},{"issue":"5","key":"10633_CR11","doi-asserted-by":"publisher","first-page":"1114","DOI":"10.1109\/TMI.2017.2787657","volume":"37","author":"Y Jin","year":"2017","unstructured":"Jin Y, Dou Q, Chen H, Yu L, Qin J, Fu C W, Heng P A (2017) SV-RCNEt: workflow recognition from surgical videos using recurrent convolutional network. IEEE Trans Medical Imag 37(5):1114\u20131126","journal-title":"IEEE Trans Medical Imag"},{"issue":"3","key":"10633_CR12","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1016\/j.cviu.2011.09.006","volume":"116","author":"DI Kosmopoulos","year":"2012","unstructured":"Kosmopoulos D I, Doulamis N D, Voulodimos A S (2012) Bayesian filter based behavior recognition in workflows allowing for user feedback. Comput Vis Image Underst 116(3):422\u2013434","journal-title":"Comput Vis Image Underst"},{"doi-asserted-by":"crossref","unstructured":"Kulkarni A, Shivananda A (2019) Deep learning for NLP. In: Natural language processing recipes, pp 185\u2013227","key":"10633_CR13","DOI":"10.1007\/978-1-4842-4267-4_6"},{"issue":"4","key":"10633_CR14","doi-asserted-by":"publisher","first-page":"966","DOI":"10.1109\/TBME.2011.2181168","volume":"59","author":"F Lalys","year":"2011","unstructured":"Lalys F, Riffaud L, Bouget D, Jannin P (2011) A framework for the recognition of high-level surgical tasks from video images for cataract surgeries. IEEE Trans Biomed Eng 59(4):966\u2013976","journal-title":"IEEE Trans Biomed Eng"},{"doi-asserted-by":"crossref","unstructured":"Lan T, Wang Y, Mori G (2011) Discriminative figure-centric models for joint action localization and recognition. In: 2011 International conference on computer vision, pp 2003\u20132010","key":"10633_CR15","DOI":"10.1109\/ICCV.2011.6126472"},{"key":"10633_CR16","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1016\/j.cviu.2017.10.011","volume":"166","author":"Z Li","year":"2018","unstructured":"Li Z, Gavrilyuk K, Gavves E, Jain M, Snoek C G (2018) Videolstm convolves, attends and flows for action recognition. Comput Vis Image Underst 166:41\u201350","journal-title":"Comput Vis Image Underst"},{"doi-asserted-by":"crossref","unstructured":"Long F, Yao T, Qiu Z, Tian X, Luo J, Mei T (2019) Gaussian temporal awareness networks for action localization. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 344\u2013353","key":"10633_CR17","DOI":"10.1109\/CVPR.2019.00043"},{"unstructured":"Lu J, Corso JJ (2015) Human action segmentation with hierarchical supervoxel consistency. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3762\u20133771","key":"10633_CR18"},{"unstructured":"Lu J, Yang J, Batra D, Parikh D (2016) Hierarchical co-attention for visual question answering. In: Neural Information Processing Systems (NIPS), pp 2","key":"10633_CR19"},{"issue":"7","key":"10633_CR20","doi-asserted-by":"publisher","first-page":"1558","DOI":"10.1109\/TMM.2017.2659221","volume":"19","author":"Z Ma","year":"2017","unstructured":"Ma Z, Chang X, Yang Y, Sebe N, Hauptmann A G (2017) The many shades of negativity. IEEE Trans Multimed 19(7):1558\u20131568","journal-title":"IEEE Trans Multimed"},{"doi-asserted-by":"crossref","unstructured":"Makantasis K, Doulamis A, Doulamis N, Psychas K (2016) Deep learning based human behavior recognition in industrial workflows. In: 2016 IEEE International conference on image processing (ICIP), pp 1609\u20131613","key":"10633_CR21","DOI":"10.1109\/ICIP.2016.7532630"},{"issue":"2","key":"10633_CR22","doi-asserted-by":"publisher","first-page":"82","DOI":"10.1080\/13645706.2019.1584116","volume":"28","author":"N Padoy","year":"2019","unstructured":"Padoy N (2019) Machine and deep learning for workflow recognition during surgery. Minimally Invasive Therapy & Allied Technologies 28(2):82\u201390","journal-title":"Minimally Invasive Therapy & Allied Technologies"},{"doi-asserted-by":"crossref","unstructured":"Protopapadakis EE, Doulamis AD, Doulamis ND (2013) Tapped delay multiclass support vector machines for industrial workflow recognition. In: 2013 14th International workshop on image analysis for multimedia interactive services (WIAMIS), pp 1\u20134","key":"10633_CR23","DOI":"10.1109\/WIAMIS.2013.6616141"},{"unstructured":"Protopapadakis E, Doulamis A, Makantasis K, Voulodimos A (2012) A semi-supervised approach for industrial workflow recognition. In: Proceedings of the second international conference on advanced communications and computation, pp 21\u201326","key":"10633_CR24"},{"issue":"1-3","key":"10633_CR25","doi-asserted-by":"publisher","first-page":"17","DOI":"10.1080\/135062800394667","volume":"7","author":"RA Rensink","year":"2000","unstructured":"Rensink R A (2000) The dynamic representation of scenes. Vis Cogn 7(1-3):17\u201342","journal-title":"Vis Cogn"},{"unstructured":"Sharma S, Kiros R, Salakhutdinov R (2015) Action recognition using visual attention. arXiv:1511.04119","key":"10633_CR26"},{"doi-asserted-by":"crossref","unstructured":"Shou Z, Wang D, Chang SF (2016) Temporal action localization in untrimmed videos via multi-stage cnns. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1049\u20131058","key":"10633_CR27","DOI":"10.1109\/CVPR.2016.119"},{"doi-asserted-by":"crossref","unstructured":"Tao L, Zappella L, Hager GD, Vidal R (2013) Surgical gesture segmentation and recognition. In: International conference on medical image computing and computer-assisted intervention, pp 339\u2013346","key":"10633_CR28","DOI":"10.1007\/978-3-642-40760-4_43"},{"doi-asserted-by":"crossref","unstructured":"Thomay C, Gollan B, Haslgr\u00fcbler M, Ferscha A, Heftberger J (2019) A multi-sensor algorithm for activity and workflow recognition in an industrial setting. In: Proceedings of the 12th ACM international conference on pervasive technologies related to assistive environments, pp 69\u201376","key":"10633_CR29","DOI":"10.1145\/3316782.3321523"},{"doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 4489\u20134497","key":"10633_CR30","DOI":"10.1109\/ICCV.2015.510"},{"doi-asserted-by":"crossref","unstructured":"Tran D, Wang H, Torresani L, Ray J, LeCun Y, Paluri M (2018) A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6450\u20136459","key":"10633_CR31","DOI":"10.1109\/CVPR.2018.00675"},{"issue":"6","key":"10633_CR32","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TPAMI.2017.2712608","volume":"40","author":"G Varol","year":"2018","unstructured":"Varol G, Laptev I, Schmid C (2018) Long-term temporal convolutions for action recognition. IEEE Trans Pattern Anal Mach Intell 40(6):1510\u20131517","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"3","key":"10633_CR33","doi-asserted-by":"publisher","first-page":"42","DOI":"10.1109\/MMUL.2012.31","volume":"19","author":"A Voulodimos","year":"2012","unstructured":"Voulodimos A, Kosmopoulos D, Vasileiou G, Sardis E, Anagnostopoulos V, Lalos C, Varvarigou T (2012) A threefold dataset for activity and workflow recognition in complex industrial environments. IEEE MultiMedia 19(3):42\u201352","journal-title":"IEEE MultiMedia"},{"issue":"8","key":"10633_CR34","doi-asserted-by":"publisher","first-page":"852","DOI":"10.1016\/j.neunet.2011.06.001","volume":"24","author":"A Voulodimos","year":"2011","unstructured":"Voulodimos A, Kosmopoulos D, Veres G, Grabner H, Van Gool L, Varvarigou T (2011) Online classification of visual tasks for industrial workflow monitoring. Neural Netw 24(8):852\u2013860","journal-title":"Neural Netw"},{"issue":"2","key":"10633_CR35","first-page":"2","volume":"1","author":"L Wang","year":"2014","unstructured":"Wang L, Qiao Y, Tang X (2014) Action recognition and detection by combining motion and appearance features. THUMOS14 Action Recognition Challenge 1(2):2","journal-title":"THUMOS14 Action Recognition Challenge"},{"doi-asserted-by":"crossref","unstructured":"Wang H, Schmid C (2013) Action recognition with improved trajectories. In: Proceedings of the IEEE international conference on computer vision (ICCV), pp 3551\u20133558","key":"10633_CR36","DOI":"10.1109\/ICCV.2013.441"},{"issue":"11","key":"10633_CR37","doi-asserted-by":"publisher","first-page":"2740","DOI":"10.1109\/TPAMI.2018.2868668","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang L, Xiong Y, Wang Z, Qiao Y, Lin D, Tang X, Van Gool L (2018) Temporal segment networks for action recognition in videos. IEEE Trans Pattern Anal Mach Intell 41(11):2740\u20132755","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Bengio Y (2015) Show, attend and tell: Neural image caption generation with visual attention. In: International conference on machine learning, pp 2048\u20132057","key":"10633_CR38"},{"doi-asserted-by":"crossref","unstructured":"Xu H, Das A, Saenko K (2017) R-c3d: Region convolutional 3d network for temporal activity detection. In: Proceedings of the IEEE international conference on computer vision, pp 5783\u20135792","key":"10633_CR39","DOI":"10.1109\/ICCV.2017.617"},{"issue":"2","key":"10633_CR40","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1007\/s11263-014-0781-x","volume":"113","author":"Y Yang","year":"2015","unstructured":"Yang Y, Ma Z, Nie F, Chang X, Hauptmann A G (2015) Multi-class active learning by uncertainty sampling with diversity maximization. Int J Comput Vis 113(2):113\u2013127","journal-title":"Int J Comput Vis"},{"unstructured":"Zaremba W, Sutskever I, Vinyals O (2014) Recurrent neural network regularization. arXiv:1409.2329","key":"10633_CR41"},{"doi-asserted-by":"crossref","unstructured":"Zhang Q, Hua G (2015) Multi-view visual recognition of imperfect testing data. In: Proceedings of the 23rd ACM international conference on multimedia, pp 561\u2013570","key":"10633_CR42","DOI":"10.1145\/2733373.2806224"},{"unstructured":"Zhang L, Wang QW (2018) XIOLIFT database, https:\/\/pan.baidu.com\/s\/lySILNURWDN40q5TpAvGKUA","key":"10633_CR43"},{"doi-asserted-by":"crossref","unstructured":"Zhu W, Hu J, Sun G, Cao X, Qiao Y (2016) A key volume mining deep framework for action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1991\u20131999","key":"10633_CR44","DOI":"10.1109\/CVPR.2016.219"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-10633-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-021-10633-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-021-10633-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,11,30]],"date-time":"2021-11-30T17:34:46Z","timestamp":1638293686000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-021-10633-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,3,6]]},"references-count":44,"journal-issue":{"issue":"28-29","published-print":{"date-parts":[[2021,11]]}},"alternative-id":["10633"],"URL":"https:\/\/doi.org\/10.1007\/s11042-021-10633-5","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2021,3,6]]},"assertion":[{"value":"28 December 2019","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 December 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 February 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 March 2021","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}