{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T08:35:14Z","timestamp":1775205314502,"version":"3.50.1"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012182","type":"print"},{"value":"9783030012199","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01219-9_16","type":"book-chapter","created":{"date-parts":[[2018,10,6]],"date-time":"2018-10-06T14:23:51Z","timestamp":1538835831000},"page":"264-280","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":65,"title":["Diagnosing Error in Temporal Action Detectors"],"prefix":"10.1007","author":[{"given":"Humam","family":"Alwassel","sequence":"first","affiliation":[]},{"given":"Fabian","family":"Caba Heilbron","sequence":"additional","affiliation":[]},{"given":"Victor","family":"Escorcia","sequence":"additional","affiliation":[]},{"given":"Bernard","family":"Ghanem","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,7]]},"reference":[{"key":"16_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"253","DOI":"10.1007\/978-3-030-01240-3_16","volume-title":"ECCV 2018, Part IX","author":"H Alwassel","year":"2018","unstructured":"Alwassel, H., Caba Heilbron, F., Ghanem, B.: Action search: spotting actions in videos and its application to temporal action localization. In: Ferrari, V. (ed.) ECCV 2018, Part IX. LNCS, vol. 11213, pp. 253\u2013269. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01240-3_16"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Buch, S., Escorcia, V., Shen, C., Ghanem, B., Niebles, J.C.: SST: single-stream temporal action proposals. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, pp. 6373\u20136382 (2017)","DOI":"10.1109\/CVPR.2017.675"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Barrios, W., Escorcia, V., Ghanem, B.: SCC: semantic context cascade for efficient action detection. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.338"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR 2015, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"16_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"212","DOI":"10.1007\/978-3-030-01252-6_13","volume-title":"ECCV 2018, Part XI","author":"F Caba Heilbron","year":"2018","unstructured":"Caba Heilbron, F., Lee, J.Y., Jin, H., Ghanem, B.: What do I annotate next? An empirical study of active learning for action localization. In: Ferrari, V., et al. (eds.) ECCV 2018, Part XI. LNCS, vol. 11215, pp. 212\u2013229. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01252-6_13"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Niebles, J.C., Ghanem, B.: Fast temporal activity proposals for efficient detection of human actions in untrimmed videos. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, pp. 1914\u20131923 (2016)","DOI":"10.1109\/CVPR.2016.211"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, 21\u201326 July, 2017, pp. 4724\u20134733 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Dai, X., Singh, B., Zhang, G., Davis, L.S., Chen, Y.Q.: Temporal context network for activity localization in videos. In: ICCV, pp. 5727\u20135736 (2017)","DOI":"10.1109\/ICCV.2017.610"},{"key":"16_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"768","DOI":"10.1007\/978-3-319-46487-9_47","volume-title":"Computer Vision \u2013 ECCV 2016","author":"V Escorcia","year":"2016","unstructured":"Escorcia, V., Caba Heilbron, F., Niebles, J.C., Ghanem, B.: DAPs: deep action proposals for action understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9907, pp. 768\u2013784. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46487-9_47"},{"key":"16_CR10","unstructured":"Escorcia, V., Dao, C.D., Jain, M., Ghanem, B., Snoek, C.: Guess where? Actor-supervision for spatiotemporal action localization. CoRR abs\/1804.01824 (2018)"},{"issue":"1","key":"16_CR11","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","volume":"111","author":"M Everingham","year":"2015","unstructured":"Everingham, M., Eslami, S.M.A., Gool, L.J.V., Williams, C.K.I., Winn, J.M., Zisserman, A.: The Pascal visual object classes challenge: a retrospective. Int. J. Comput. Vis. IJCV 111(1), 98\u2013136 (2015)","journal-title":"Int. J. Comput. Vis. IJCV"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Gao, J., Yang, Z., Sun, C., Chen, K., Nevatia, R.: Turn tap: temporal unit regression network for temporal action proposals. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.392"},{"key":"16_CR13","unstructured":"Ghanem, B., et al.: ActivityNet challenge 2017 summary. CoRR abs\/1710.08011 (2017)"},{"issue":"1","key":"16_CR14","doi-asserted-by":"publisher","first-page":"142","DOI":"10.1109\/TPAMI.2015.2437384","volume":"38","author":"RB Girshick","year":"2016","unstructured":"Girshick, R.B., Donahue, J., Darrell, T., Malik, J.: Region-based convolutional networks for accurate object detection and segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 38(1), 142\u2013158 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"16_CR15","doi-asserted-by":"publisher","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, 22\u201329 October 2017, pp. 5843\u20135851 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.622","DOI":"10.1109\/ICCV.2017.622"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Gu, C., et al.: AVA: a video dataset of spatio-temporally localized atomic visual actions. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018 (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"16_CR17","doi-asserted-by":"publisher","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, 27\u201330 June 2016, pp. 770\u2013778 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"16_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"340","DOI":"10.1007\/978-3-642-33712-3_25","volume-title":"Computer Vision \u2013 ECCV 2012","author":"D Hoiem","year":"2012","unstructured":"Hoiem, D., Chodpathumwan, Y., Dai, Q.: Diagnosing error in object detectors. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7574, pp. 340\u2013353. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33712-3_25"},{"key":"16_CR19","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.cviu.2016.10.018","volume":"155","author":"H Idrees","year":"2017","unstructured":"Idrees, H., et al.: The THUMOS challenge on action recognition for videos \u201cin the wild\u201d. Comput. Vis. Image Underst. 155, 1\u201323 (2017)","journal-title":"Comput. Vis. Image Underst."},{"key":"16_CR20","unstructured":"Jiang, Y.G., et al.: THUMOS challenge: action recognition with a large number of classes (2014). http:\/\/crcv.ucf.edu\/THUMOS14\/"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"16_CR22","unstructured":"Kay, W., et al.: The kinetics human action video dataset. CoRR abs\/1705.06950 (2017)"},{"key":"16_CR23","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: Pereira, F., Burges, C.J.C., Bottou, L., Weinberger, K.Q. (eds.) Advances in Neural Information Processing Systems 25, pp. 1097\u20131105. Curran Associates, Inc. (2012)"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Laptev, I., Marsza\u0142ek, M., Schmid, C., Rozenfeld, B.: Learning realistic human actions from movies. In: IEEE Conference on Computer Vision and Pattern Recognition (2008)","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Shou, Z.: Single shot temporal action detection. In: ACM on Multimedia Conference, MM 2017 (2017)","DOI":"10.1145\/3123266.3123343"},{"key":"16_CR26","unstructured":"Lin, T., Zhao, X., Shou, Z.: Temporal convolution based action proposal: submission to ActivityNet 2017. CoRR abs\/1707.06750 (2017)"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Moltisanti, D., Wray, M., Mayol-Cuevas, W.W., Damen, D.: Trespassing the boundaries: labeling temporal bounds for object interactions in egocentric video. In: IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, 22\u201329 October 2017, pp. 2905\u20132913 (2017)","DOI":"10.1109\/ICCV.2017.314"},{"key":"16_CR28","unstructured":"Monfort, M., et al.: Moments in time dataset: one million videos for event understanding"},{"key":"16_CR29","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems (NIPS) (2015)"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Ronchi, M.R., Perona, P.: Benchmarking and error diagnosis in multi-instance pose estimation. In: ICCV 2017, pp. 369\u2013378 (2017)","DOI":"10.1109\/ICCV.2017.48"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., Deng, J., Huang, Z., Berg, A.C., Li, F.: Detecting avocados to zucchinis: what have we done, and where are we going? ICCV 2013, pp. 2064\u20132071 (2013)","DOI":"10.1109\/ICCV.2013.258"},{"issue":"3","key":"16_CR32","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. IJCV 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis. IJCV"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Shou, Z., Chan, J., Zareian, A., Miyazawa, K., Chang, S.F.: CDC: convolutional-de-convolutional networks for precise temporal action localization in untrimmed videos. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.155"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Shou, Z., Wang, D., Chang, S.F.: Temporal action localization in untrimmed videos via multi-stage CNNs. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.119"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Divvala, S., Farhadi, A., Gupta, A.: Asynchronous temporal fields for action recognition. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2017)","DOI":"10.1109\/CVPR.2017.599"},{"key":"16_CR36","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Russakovsky, O., Gupta, A.: What actions are needed for understanding human actions in videos? In: ICCV 2017, pp. 2156\u20132165 (2017)","DOI":"10.1109\/ICCV.2017.235"},{"key":"16_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"16_CR38","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS (2014)"},{"key":"16_CR39","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L.D., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"16_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"16_CR41","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R.B., Gupta, A., He, K.: Non-local neural networks. CoRR abs\/1711.07971 (2017)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"16_CR42","unstructured":"Xiong, Y., Zhao, Y., Wang, L., Lin, D., Tang, X.: A pursuit of temporal accuracy in general activity detection. CoRR abs\/1703.02716 (2017)"},{"key":"16_CR43","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-C3D: region convolutional 3D network for temporal activity detection. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.617"},{"key":"16_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, S., Benenson, R., Omran, M., Hosang, J.H., Schiele, B.: How far are we from solving pedestrian detection? In: CVPR 2016, pp. 1259\u20131267 (2016)","DOI":"10.1109\/CVPR.2016.141"},{"key":"16_CR45","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: ICCV 2017, October 2017","DOI":"10.1109\/ICCV.2017.317"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01219-9_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,6]],"date-time":"2022-10-06T01:09:11Z","timestamp":1665018551000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01219-9_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012182","9783030012199"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01219-9_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"7 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}