{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T05:16:30Z","timestamp":1751606190395,"version":"3.37.3"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030208899"},{"type":"electronic","value":"9783030208905"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-20890-5_36","type":"book-chapter","created":{"date-parts":[[2019,6,1]],"date-time":"2019-06-01T11:18:34Z","timestamp":1559387914000},"page":"558-574","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Cascaded Pyramid Mining Network for Weakly Supervised Temporal Action Localization"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4228-7439","authenticated-orcid":false,"given":"Haisheng","family":"Su","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8176-623X","authenticated-orcid":false,"given":"Xu","family":"Zhao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5535-279X","authenticated-orcid":false,"given":"Tianwei","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,6,2]]},"reference":[{"key":"36_CR1","unstructured":"Abadi, M., Agarwal, A., Barham, P., et al.: Tensorflow: large-scale machine learning on heterogeneous distributed systems. arXiv preprint \n                      arXiv:1603.04467\n                      \n                     (2016)"},{"key":"36_CR2","doi-asserted-by":"crossref","unstructured":"Bai, P., Tang, X., Wang, X., Liu, W.: Multiple instance detection network with online instance classifier refinement. In: CVPR, pp. 4322\u20134328 (2017)","DOI":"10.1109\/CVPR.2017.326"},{"key":"36_CR3","doi-asserted-by":"crossref","unstructured":"Bilen, H., Vedaldi, A.: Weakly supervised deep detection networks. In: CVPR, pp. 2846\u20132854 (2016)","DOI":"10.1109\/CVPR.2016.311"},{"key":"36_CR4","doi-asserted-by":"crossref","unstructured":"Caba Heilbron, F., Escorcia, V., Ghanem, B., Carlos Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"36_CR5","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Vijayanarasimhan, S., Seybold, B., Ross, D.A., Deng, J., Sukthankar, R.: Rethinking the faster R-CNN architecture for temporal action localization. In: CVPR, pp. 1130\u20131139 (2018)","DOI":"10.1109\/CVPR.2018.00124"},{"key":"36_CR6","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. In: CVPR, pp. 1933\u20131941 (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"36_CR7","doi-asserted-by":"crossref","unstructured":"Gao, J., Yang, Z., Nevatia, R.: Cascaded boundary regression for temporal action detection. arXiv preprint \n                      arXiv:1705.01180\n                      \n                     (2017)","DOI":"10.5244\/C.31.52"},{"key":"36_CR8","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: CVPR, pp. 580\u2013587 (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"36_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask-RCNN. \n                      arXiv:1703.06870v2\n                      \n                     (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"36_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"36_CR11","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Barrios, W., Escorcia, V., Ghanem, B.: SCC: semantic context cascade for efficient action detection. In: CVPR, pp. 3175\u20133184 (2017)","DOI":"10.1109\/CVPR.2017.338"},{"key":"36_CR12","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. arXiv preprint \n                      arXiv:1502.03167\n                      \n                     (2015)"},{"key":"36_CR13","doi-asserted-by":"crossref","unstructured":"Jia, Y., et al.: Caffe: convolutional architecture for fast feature embedding. In: Proceedings of the 22nd ACM International Conference on Multimedia, pp. 675\u2013678. ACM (2014)","DOI":"10.1145\/2647868.2654889"},{"key":"36_CR14","unstructured":"Jiang, Y., et al.: THUMOS challenge: action recognition with a large number of classes. In: Computer Vision-ECCV Workshop 2014 (2014)"},{"key":"36_CR15","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Shou, Z.: Single shot temporal action detection. In: Proceedings of the 2017 ACM on Multimedia Conference, pp. 988\u2013996. ACM (2017)","DOI":"10.1145\/3123266.3123343"},{"key":"36_CR16","unstructured":"Lin, T., Zhao, X., Shou, Z.: Temporal convolution based action proposal: submission to activitynet 2017. arXiv preprint \n                      arXiv:1707.06750\n                      \n                     (2017)"},{"key":"36_CR17","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Su, H., Wang, C., Yang, M.: BSN: boundary sensitive network for temporal action proposal generation. arXiv preprint \n                      arXiv:1806.02964\n                      \n                     (2018)","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"36_CR18","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R.B., He, K., Hariharan, B., Belongie, S.J.: Feature pyramid networks for object detection. In: CVPR, vol. 1, p. 4 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"36_CR19","unstructured":"Oneata, D., Verbeek, J., Schmid, C.: The LEAR submission at THUMOS2014. THUMOS Action Recognition challenge (2014)"},{"key":"36_CR20","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: CVPR, pp. 779\u2013788 (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"36_CR21","doi-asserted-by":"crossref","unstructured":"Redmon, J., Farhadi, A.: Yolo9000: better, faster, stronger. arXiv preprint \n                      arXiv:1612.08242\n                      \n                     (2016)","DOI":"10.1109\/CVPR.2017.690"},{"key":"36_CR22","doi-asserted-by":"crossref","unstructured":"Richard, A., Gall, J.: Temporal action detection using a statistical language model. In: CVPR, pp. 3131\u20133140 (2016)","DOI":"10.1109\/CVPR.2016.341"},{"key":"36_CR23","doi-asserted-by":"crossref","unstructured":"Shou, Z., Chan, J., Zareian, A., Miyazawa, K., Chang, S.F.: CDC: convolutional-de-convolutional networks for precise temporal action localization in untrimmed videos. In: CVPR, pp. 1417\u20131426. IEEE (2017)","DOI":"10.1109\/CVPR.2017.155"},{"key":"36_CR24","doi-asserted-by":"crossref","unstructured":"Shou, Z., Wang, D., Chang, S.F.: Temporal action localization in untrimmed videos via multi-stage CNNs. In: CVPR, pp. 1049\u20131058 (2016)","DOI":"10.1109\/CVPR.2016.119"},{"key":"36_CR25","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems, pp. 568\u2013576 (2014)"},{"key":"36_CR26","unstructured":"Singh, G., Cuzzolin, F.: Untrimmed video classification for activity detection: submission to activitynet challenge. arXiv preprint \n                      arXiv:1607.01979\n                      \n                     (2016)"},{"key":"36_CR27","doi-asserted-by":"crossref","unstructured":"Singh, K.K., Lee, Y.J.: Hide-and-seek: forcing a network to be meticulous for weakly-supervised object and action localization. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.381"},{"key":"36_CR28","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"426","DOI":"10.1007\/978-3-030-04212-7_37","volume-title":"Neural Information Processing","author":"H Su","year":"2018","unstructured":"Su, H., Zhao, X., Lin, T., Fei, H.: Weakly supervised temporal action detection with shot-based temporal pooling network. In: Cheng, L., Leung, A.C.S., Ozawa, S. (eds.) ICONIP 2018. LNCS, vol. 11304, pp. 426\u2013436. Springer, Cham (2018). \n                      https:\/\/doi.org\/10.1007\/978-3-030-04212-7_37"},{"key":"36_CR29","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"36_CR30","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JR Uijlings","year":"2013","unstructured":"Uijlings, J.R., Van De Sande, K.E., Gevers, T., Smeulders, A.W.: Selective search for object recognition. IJCV 104, 154\u2013171 (2013)","journal-title":"IJCV"},{"key":"36_CR31","doi-asserted-by":"crossref","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., Liu, C.L.: Action recognition by dense trajectories. In: CVPR, pp. 3169\u20133176. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"36_CR32","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: ICCV, pp. 3551\u20133558 (2013)","DOI":"10.1109\/ICCV.2013.441"},{"key":"36_CR33","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Lin, D., Van Gool, L.: UntrimmedNets for weakly supervised action recognition and detection. In: CVPR, vol. 2 (2017)","DOI":"10.1109\/CVPR.2017.678"},{"key":"36_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). \n                      https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"36_CR35","unstructured":"Wang, R., Tao, D.: UTS at activitynet 2016. ActivityNet large scale activity recognition challenge 2016, 8 (2016)"},{"key":"36_CR36","doi-asserted-by":"crossref","unstructured":"Wei, Y., Feng, J., Liang, X., Cheng, M.M., Zhao, Y., Yan, S.: Object region mining with adversarial erasing: a simple classification to semantic segmentation approach. In: CVPR, vol. 1, p. 3 (2017)","DOI":"10.1109\/CVPR.2017.687"},{"key":"36_CR37","unstructured":"Xiong, Y., Zhao, Y., Wang, L., Lin, D., Tang, X.: A pursuit of temporal accuracy in general activity detection. arXiv preprint \n                      arXiv:1703.02716\n                      \n                     (2017)"},{"key":"36_CR38","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-C3D: region convolutional 3D network for temporal activity detection. In: ICCV, pp. 5794\u20135803 (2017)","DOI":"10.1109\/ICCV.2017.617"},{"key":"36_CR39","doi-asserted-by":"crossref","unstructured":"Yuan, Z.H., Stroud, J.C., Lu, T., Deng, J.: Temporal action localization by structured maximal sums. In: CVPR, vol. 2, p. 7 (2017)","DOI":"10.1109\/CVPR.2017.342"},{"key":"36_CR40","doi-asserted-by":"publisher","first-page":"1084","DOI":"10.1007\/s11263-017-1059-x","volume":"126","author":"J Zhang","year":"2018","unstructured":"Zhang, J., Bargal, S.A., Lin, Z., Brandt, J., Shen, X., Sclaroff, S.: Top-down neural attention by excitation backprop. IJCV 126, 1084\u20131102 (2018)","journal-title":"IJCV"},{"key":"36_CR41","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: ICCV, vol. 2 (2017)","DOI":"10.1109\/ICCV.2017.317"},{"key":"36_CR42","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: CVPR, pp. 2921\u20132929 (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"36_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"391","DOI":"10.1007\/978-3-319-10602-1_26","volume-title":"Computer Vision \u2013 ECCV 2014","author":"CL Zitnick","year":"2014","unstructured":"Zitnick, C.L., Doll\u00e1r, P.: Edge boxes: locating object proposals from edges. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 391\u2013405. Springer, Cham (2014). \n                      https:\/\/doi.org\/10.1007\/978-3-319-10602-1_26"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2018"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-20890-5_36","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,6,1]],"date-time":"2019-06-01T11:26:28Z","timestamp":1559388388000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-20890-5_36"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030208899","9783030208905"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-20890-5_36","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"2 June 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Perth, WA","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Australia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/accv2018.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"979","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"274","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"2.7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}}]}}