{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,19]],"date-time":"2026-02-19T16:07:42Z","timestamp":1771517262927,"version":"3.50.1"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585167","type":"print"},{"value":"9783030585174","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58517-4_30","type":"book-chapter","created":{"date-parts":[[2020,10,9]],"date-time":"2020-10-09T19:03:11Z","timestamp":1602270191000},"page":"510-527","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["CFAD: Coarse-to-Fine Action Detector for Spatiotemporal Action Localization"],"prefix":"10.1007","author":[{"given":"Yuxi","family":"Li","sequence":"first","affiliation":[]},{"given":"Weiyao","family":"Lin","sequence":"additional","affiliation":[]},{"given":"John","family":"See","sequence":"additional","affiliation":[]},{"given":"Ning","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Shugong","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Ke","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Cong","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,10,10]]},"reference":[{"key":"30_CR1","unstructured":"Andrychowicz, M., et al.: Learning to learn by gradient descent by gradient descent. In: NeurIPS, pp. 3981\u20133989 (2016)"},{"key":"30_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1007\/978-3-540-24673-2_3","volume-title":"Computer Vision - ECCV 2004","author":"T Brox","year":"2004","unstructured":"Brox, T., Bruhn, A., Papenberg, N., Weickert, J.: High accuracy optical flow estimation based on a theory for warping. In: Pajdla, T., Matas, J. (eds.) ECCV 2004. LNCS, vol. 3024, pp. 25\u201336. Springer, Heidelberg (2004). https:\/\/doi.org\/10.1007\/978-3-540-24673-2_3"},{"key":"30_CR3","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? A new model and the kinetics dataset. In: CVPR, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"30_CR4","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"30_CR5","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: CVPR, pp. 580\u2013587 (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"30_CR6","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Malik, J.: Finding action tubes. In: CVPR, June 2015","DOI":"10.1109\/CVPR.2015.7298676"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Gu, C., et al.: AVA: a video dataset of spatio-temporally localized atomic visual actions. In: CVPR, pp. 6047\u20136056 (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"30_CR8","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"30_CR9","doi-asserted-by":"crossref","unstructured":"Hou, R., Chen, C., Shah, M.: An end-to-end 3D convolutional neural network for action detection and segmentation in videos. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.620"},{"key":"30_CR10","doi-asserted-by":"crossref","unstructured":"Hu, R., Doll\u00e1r, P., He, K., Darrell, T., Girshick, R.: Learning to segment every thing. In: CVPR, pp. 4233\u20134241 (2018)","DOI":"10.1109\/CVPR.2018.00445"},{"key":"30_CR11","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Van Der Maaten, L., Weinberger, K.Q.: Densely connected convolutional networks. In: CVPR, pp. 4700\u20134708 (2017)","DOI":"10.1109\/CVPR.2017.243"},{"key":"30_CR12","doi-asserted-by":"crossref","unstructured":"Huang, J., Li, N., Zhong, J., Li, T.H., Li, G.: Online action tube detection via resolving the spatio-temporal context pattern. In: ACM MM, pp. 993\u20131001. ACM (2018)","DOI":"10.1145\/3240508.3240659"},{"key":"30_CR13","unstructured":"Jaderberg, M., Simonyan, K., Zisserman, A., et al.: Spatial transformer networks. In: NeurIPS, pp. 2017\u20132025 (2015)"},{"key":"30_CR14","doi-asserted-by":"crossref","unstructured":"Jhuang, H., Gall, J., Zuffi, S., Schmid, C., Black, M.J.: Towards understanding action recognition. In: ICCV, pp. 3192\u20133199, December 2013","DOI":"10.1109\/ICCV.2013.396"},{"key":"30_CR15","doi-asserted-by":"crossref","unstructured":"Kalogeiton, V., Weinzaepfel, P., Ferrari, V., Schmid, C.: Action tubelet detector for spatio-temporal action localization. In: ICCV, pp. 4405\u20134413 (2017)","DOI":"10.1109\/ICCV.2017.472"},{"key":"30_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"306","DOI":"10.1007\/978-3-030-01231-1_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Li","year":"2018","unstructured":"Li, D., Qiu, Z., Dai, Q., Yao, T., Mei, T.: Recurrent tubelet proposal and recognition networks for action detection. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 306\u2013322. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_19"},{"key":"30_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, Z., Wang, L., Wu, G.: Actions as moving points. arXiv preprint arXiv:2001.04608 (2020)","DOI":"10.1007\/978-3-030-58517-4_5"},{"key":"30_CR18","doi-asserted-by":"publisher","first-page":"41","DOI":"10.1016\/j.cviu.2017.10.011","volume":"166","author":"Z Li","year":"2018","unstructured":"Li, Z., Gavrilyuk, K., Gavves, E., Jain, M., Snoek, C.G.: VideoLSTM convolves, attends and flows for action recognition. Comput. Vis. Image Underst. 166, 41\u201350 (2018)","journal-title":"Comput. Vis. Image Underst."},{"key":"30_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"744","DOI":"10.1007\/978-3-319-46493-0_45","volume-title":"Computer Vision \u2013 ECCV 2016","author":"X Peng","year":"2016","unstructured":"Peng, X., Schmid, C.: Multi-region two-stream R-CNN for action detection. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 744\u2013759. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_45"},{"key":"30_CR20","doi-asserted-by":"crossref","unstructured":"Pramono, R.R.A., Chen, Y.T., Fang, W.H.: Hierarchical self-attention network for action localization in videos. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00015"},{"key":"30_CR21","doi-asserted-by":"crossref","unstructured":"Qiu, Z., Yao, T., Ngo, C.W., Tian, X., Mei, T.: Learning spatio-temporal representation with local and global diffusion. In: CVPR, pp. 12056\u201312065 (2019)","DOI":"10.1109\/CVPR.2019.01233"},{"key":"30_CR22","doi-asserted-by":"crossref","unstructured":"Redmon, J., Farhadi, A.: Yolo9000: better, faster, stronger. In: CVPR, pp. 7263\u20137271 (2017)","DOI":"10.1109\/CVPR.2017.690"},{"key":"30_CR23","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NeurIPS, pp. 91\u201399 (2015)"},{"key":"30_CR24","doi-asserted-by":"crossref","unstructured":"Rodriguez, M.D., Ahmed, J., Shah, M.: Action MACH a spatio-temporal maximum average correlation height filter for action recognition. In: CVPR, pp. 1\u20138, June 2008","DOI":"10.1109\/CVPR.2008.4587727"},{"key":"30_CR25","doi-asserted-by":"crossref","unstructured":"Saha, S., Singh, G., Cuzzolin, F.: AMTNet: action-micro-tube regression by end-to-end trainable deep architecture. In: ICCV, pp. 4414\u20134423 (2017)","DOI":"10.1109\/ICCV.2017.473"},{"key":"30_CR26","doi-asserted-by":"crossref","unstructured":"Saha, S., Singh, G., Sapienza, M., Torr, P.H., Cuzzolin, F.: Deep learning for detecting multiple space-time action tubes in videos. In: BMVC (2016)","DOI":"10.5244\/C.30.58"},{"key":"30_CR27","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NeurIPS, pp. 568\u2013576 (2014)"},{"key":"30_CR28","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"30_CR29","doi-asserted-by":"crossref","unstructured":"Singh, G., Saha, S., Sapienza, M., Torr, P.H., Cuzzolin, F.: Online real-time multiple spatiotemporal action localisation and prediction. In: ICCV, pp. 3637\u20133646 (2017)","DOI":"10.1109\/ICCV.2017.393"},{"key":"30_CR30","doi-asserted-by":"crossref","unstructured":"Song, L., Zhang, S., Yu, G., Sun, H.: TACNet: transition-aware context network for spatio-temporal action detection. In: CVPR, pp. 11987\u201311995 (2019)","DOI":"10.1109\/CVPR.2019.01226"},{"key":"30_CR31","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild (2012)"},{"key":"30_CR32","doi-asserted-by":"crossref","unstructured":"Su, R., Ouyang, W., Zhou, L., Xu, D.: Improving action localization by progressive cross-stream cooperation. In: CVPR, pp. 12016\u201312025 (2019)","DOI":"10.1109\/CVPR.2019.01229"},{"key":"30_CR33","doi-asserted-by":"crossref","unstructured":"Sun, L., Jia, K., Chen, K., Yeung, D.Y., Shi, B.E., Savarese, S.: Lattice long short-term memory for human action recognition. In: ICCV, pp. 2147\u20132156 (2017)","DOI":"10.1109\/ICCV.2017.236"},{"key":"30_CR34","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV, December 2015","DOI":"10.1109\/ICCV.2015.510"},{"key":"30_CR35","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: CVPR, June 2018","DOI":"10.1109\/CVPR.2018.00813"},{"key":"30_CR36","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-C3D: region convolutional 3D network for temporal activity detection. In: ICCV, pp. 5783\u20135792 (2017)","DOI":"10.1109\/ICCV.2017.617"},{"key":"30_CR37","unstructured":"Yang, T., Zhang, X., Li, Z., Zhang, W., Sun, J.: MetaAnchor: learning to detect objects with customized anchors. In: NeurIPS, pp. 320\u2013330 (2018)"},{"key":"30_CR38","doi-asserted-by":"crossref","unstructured":"Yang, X., Yang, X., Liu, M.Y., Xiao, F., Davis, L.S., Kautz, J.: STEP: spatio-temporal progressive learning for video action detection. In: CVPR, pp. 264\u2013272 (2019)","DOI":"10.1109\/CVPR.2019.00035"},{"key":"30_CR39","doi-asserted-by":"crossref","unstructured":"Zhao, J., Snoek, C.G.: Dance with flow: two-in-one stream action detection. In: CVPR, pp. 9935\u20139944 (2019)","DOI":"10.1109\/CVPR.2019.01017"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58517-4_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,9]],"date-time":"2024-10-09T00:14:55Z","timestamp":1728432895000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58517-4_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585167","9783030585174"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58517-4_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"10 October 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic. From the ECCV Workshops 249 full papers, 18 short papers, and 21 further contributions were published out of a total of 467 submissions.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}