{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:51:38Z","timestamp":1778082698695,"version":"3.51.4"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319464923","type":"print"},{"value":"9783319464930","type":"electronic"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46493-0_9","type":"book-chapter","created":{"date-parts":[[2016,9,16]],"date-time":"2016-09-16T14:59:53Z","timestamp":1474037993000},"page":"137-153","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":107,"title":["Connectionist Temporal Modeling for Weakly Supervised Action Labeling"],"prefix":"10.1007","author":[{"given":"De-An","family":"Huang","sequence":"first","affiliation":[]},{"given":"Li","family":"Fei-Fei","sequence":"additional","affiliation":[]},{"given":"Juan Carlos","family":"Niebles","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,17]]},"reference":[{"issue":"11","key":"9_CR1","doi-asserted-by":"publisher","first-page":"2274","DOI":"10.1109\/TPAMI.2012.120","volume":"34","author":"R Achanta","year":"2012","unstructured":"Achanta, R., Shaji, A., Smith, K., Lucchi, A., Fua, P., Susstrunk, S.: Slic superpixels compared to state-of-the-art superpixel methods. IEEE Trans. Pattern Anal. Mach. Intell. 34(11), 2274\u20132282 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR2","unstructured":"Alayrac, J.B., Bojanowski, P., Agrawal, N., Sivic, J., Laptev, I., Lacoste-Julien, S.: Learning from narrated instruction videos (2015). arXiv preprint arXiv:1506.09215"},{"key":"9_CR3","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., Bach, F., Laptev, I., Ponce, J., Schmid, C., Sivic, J.: Finding actors and actions in movies. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2280\u20132287 (2013)","DOI":"10.1109\/ICCV.2013.283"},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., Lagugie, R., Grave, E., Bach, F., Laptev, I., Ponce, J., Schmid, C.: Weakly-supervised alignment of video with text. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.507"},{"key":"9_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"628","DOI":"10.1007\/978-3-319-10602-1_41","volume-title":"Computer Vision \u2013 ECCV 2014","author":"P Bojanowski","year":"2014","unstructured":"Bojanowski, P., Lajugie, R., Bach, F., Laptev, I., Ponce, J., Schmid, C., Sivic, J.: Weakly supervised action labeling in videos under ordering constraints. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 628\u2013643. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10602-1_41"},{"issue":"3","key":"9_CR6","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1961189.1961199","volume":"2","author":"Chih-Chung Chang","year":"2011","unstructured":"Chang, C.C., Lin, C.J.: LIBSVM: a library for support vector machines. ACM Trans. Intell. Syst. Technol. 2, 27:1\u201327:27 (2011). http:\/\/www.csie.ntu.edu.tw\/~cjlin\/libsvm","journal-title":"ACM Transactions on Intelligent Systems and Technology"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Chen, X., Zitnick, C.L.: Minds eye: a recurrent visual representation for image caption generation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"9_CR8","unstructured":"Donahue, J., Hendricks, L.A., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description (2014). arXiv preprint arXiv:1411.4389"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Duchenne, O., Laptev, I., Sivic, J., Bach, F., Ponce, J.: Automatic annotation of human actions in video. In: ICCV (2009)","DOI":"10.1109\/ICCV.2009.5459279"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Fernando, B., Gavves, E., Oramas, J.M., Ghodrati, A., Tuytelaars, T.: Modeling video evolution for action recognition. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299176"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Malik, J.: Finding action tubes. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298676"},{"key":"9_CR12","unstructured":"Gorban, A., Idrees, H., Jiang, Y.G., Roshan Zamir, A., Laptev, I., Shah, M., Sukthankar, R.: THUMOS challenge: action recognition with a large number of classes (2015). http:\/\/www.thumos.info\/"},{"key":"9_CR13","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: ICML (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Graves, A., Jaitly, N.: Towards end-to-end speech recognition with recurrent neural networks. In: ICML (2014)","DOI":"10.1109\/ICASSP.2013.6638947"},{"issue":"5","key":"9_CR15","doi-asserted-by":"publisher","first-page":"602","DOI":"10.1016\/j.neunet.2005.06.042","volume":"18","author":"A Graves","year":"2005","unstructured":"Graves, A., Schmidhuber, J.: Framewise phoneme classification with bidirectional LSTM and other neural network architectures. Neural Netw. 18(5), 602\u2013610 (2005)","journal-title":"Neural Netw."},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: Activitynet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Arslan, A., Serre, T.: The language of actions: recovering the syntax and semantics of goal-directed human activities. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.105"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Gall, J., Serre, T.: An end-to-end generative framework for video segmentation and recognition. In: WACV (2016)","DOI":"10.1109\/WACV.2016.7477701"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Lan, T., Zhu, Y., Zamir, A.R., Savarese, S.: Action recognition by hierarchical mid-level action elements. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.517"},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marsza\u0142ek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: CVPR (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"9_CR24","doi-asserted-by":"crossref","unstructured":"Lillo, I., Soto, A., Niebles, J.C.: Discriminative hierarchical modeling of spatio-temporally composable human activities. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.109"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Malmaud, J., Huang, J., Rathod, V., Johnston, N., Rabinovich, A., Murphy, K.: What\u2019s cookin\u2019? Interpreting Cooking Videos Using Text, Speech and Vision. In: NAACL (2015)","DOI":"10.3115\/v1\/N15-1015"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Messing, R., Pal, C., Kautz, H.: Activity recognition using the velocity histories of tracked keypoints. In: CVPR (2009)","DOI":"10.1109\/ICCV.2009.5459154"},{"key":"9_CR27","unstructured":"Ng, J.Y.H., Hausknecht, M., Vijayanarasimhan, S., Vinyals, O., Monga, R., Toderici, G.: Beyond short snippets: deep networks for video classification (2015). arXiv preprint arXiv:1503.08909"},{"issue":"3","key":"9_CR28","doi-asserted-by":"publisher","first-page":"299","DOI":"10.1007\/s11263-007-0122-4","volume":"79","author":"JC Niebles","year":"2008","unstructured":"Niebles, J.C., Wang, H., Fei-Fei, L.: Unsupervised learning of human action categories using spatial-temporal words. Int. J. Comput. Vis. 79(3), 299\u2013318 (2008)","journal-title":"Int. J. Comput. Vis."},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Pirsiavash, H., Ramanan, D.: Parsing videos of actions with segmental grammars. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.85"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., Joulin, A., Liang, P., Fei-Fei, L.: Linking peoplein videos with their names using conference resolution. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10590-1_7"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., Tang, K., Mori, G., Fei-Fei, L.: Learning temporal embeddings for complex video analysis. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.508"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Amin, S., Andriluka, M., Schiele, B.: A database for fine grained activity detection of cooking activities. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"9_CR33","first-page":"3","volume":"5","author":"DE Rumelhart","year":"1988","unstructured":"Rumelhart, D.E., Hinton, G.E., Williams, R.J.: Learning representations by back-propagating errors. Cogn. Model. 5, 3 (1988)","journal-title":"Cogn. Model."},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Ryoo, M.S., Aggarwal, J.K.: Spatio-temporal relationship match: video structure comparison for recognition of complex human activities. In: ICCV (2009)","DOI":"10.1109\/ICCV.2009.5459361"},{"key":"9_CR35","doi-asserted-by":"crossref","unstructured":"Sener, O., Zamir, A., Savarese, S., Saxena, A.: Unsupervised semantic parsing of video collections. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.509"},{"key":"9_CR36","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS (2014)"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Song, Y., Morency, L.P., Davis, R.: Action recognition by hierarchical sequence summarization. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.457"},{"key":"9_CR38","unstructured":"Soomro, K., Roshan Zamir, A., Shah, M.: UCF101: A dataset of 101 human actions classes from videos in the wild. In: CRCV-TR-12-01 (2012)"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Tang, K., Fei-Fei, L., Koller, D.: Learning latent temporal structure for complex event detection. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6247808"},{"key":"9_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"140","DOI":"10.1007\/978-3-642-15567-3_11","volume-title":"Computer Vision \u2013 ECCV 2010","author":"GW Taylor","year":"2010","unstructured":"Taylor, G.W., Fergus, R., LeCun, Y., Bregler, C.: Convolutional learning of spatio-temporal features. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010. LNCS, vol. 6316, pp. 140\u2013153. Springer, Heidelberg (2010). doi: 10.1007\/978-3-642-15567-3_11"},{"key":"9_CR41","unstructured":"Tieleman, T., Hinton, G.: Lecture 6.5-rmsprop: divide the gradient by a running average of its recent magnitude. COURSERA: Neural Netw. Mach. Learn. (2012)"},{"key":"9_CR42","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence-video to text. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"9_CR43","doi-asserted-by":"crossref","unstructured":"Vo, N.N., Bobick, A.F.: From stochastic grammar to bayes network: probabilistic parsing of complex activity. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.338"},{"key":"9_CR44","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.441"},{"key":"9_CR45","doi-asserted-by":"crossref","unstructured":"Wu, C., Zhang, J., Savarese, S., Saxena, A.: Watch-n-patch: unsupervised understanding of actions and relations. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299065"},{"key":"9_CR46","unstructured":"Xu, Z., Zhu, L., Yang, Y., Hauptmann, A.G.: Uts-cmu at THUMOS. CVPR THUMOS Challenge (2015)"},{"key":"9_CR47","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Video description generation incorporating spatio-temporal features and a soft-attention mechanism. In: ICCV (2015)"},{"key":"9_CR48","unstructured":"Yeung, S., Russakovsky, O., Jin, N., Andriluka, M., Mori, G., Fei-Fei, L.: Every moment counts: dense detailed labeling of actions in complex videos (2015). arXiv preprint arXiv:1507.05738"},{"key":"9_CR49","doi-asserted-by":"crossref","unstructured":"Yeung, S., Russakovsky, O., Mori, G., Fei-Fei, L.: End-to-end learning of action detection from frame glimpses in videos. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.293"},{"key":"9_CR50","doi-asserted-by":"crossref","unstructured":"Yu, S.I., Jiang, L., Hauptmann, A.: Instructional videos for unsupervised harvesting and learning of action examples. In: ACM Multimedia (2014)","DOI":"10.1145\/2647868.2654997"},{"key":"9_CR51","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R., Salakhutdinov, R., Urtasun, R., Torralba, A., Fidler, S.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46493-0_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,8]],"date-time":"2022-07-08T17:12:32Z","timestamp":1657300352000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46493-0_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319464923","9783319464930"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46493-0_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"17 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}