{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T16:59:47Z","timestamp":1780765187242,"version":"3.54.1"},"publisher-location":"Cham","reference-count":52,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031784439","type":"print"},{"value":"9783031784446","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T00:00:00Z","timestamp":1733270400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78444-6_17","type":"book-chapter","created":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T10:41:00Z","timestamp":1733222460000},"page":"252-267","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Towards Completeness: A Generalizable Action Proposal Generator for\u00a0Zero-Shot Temporal Action Localization"],"prefix":"10.1007","author":[{"given":"Jia-Run","family":"Du","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kun-Yu","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jingke","family":"Meng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wei-Shi","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,12,4]]},"reference":[{"key":"17_CR1","doi-asserted-by":"crossref","unstructured":"Buch, S., Eyzaguirre, C., Gaidon, A., Wu, J., Fei-Fei, L., Niebles, J.C.: Revisiting the \u201cvideo\u201d in video-language understanding. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Cao, M., Yang, T., Weng, J., Zhang, C., Wang, J., Zou, Y.: LocVTP: video-text pre-training for temporal localization. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19809-0_3"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, F., Wang, X., Lei, J., Crandall, D., Bansal, M., Bertasius, G.: VindLU: a recipe for effective video-and-language pretraining. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01034"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Deng, C., Chen, Q., Qin, P., Chen, D., Wu, Q.: Prompt switch: efficient CLIP adaptation for text-video retrieval. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01434"},{"key":"17_CR7","unstructured":"Du, J.R., et al.: Weakly-supervised temporal action localization by progressive complementary learning. arXiv (2022)"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Feng, J.C., Hong, F.T., Zheng, W.S.: MIST: multiple instance self-training framework for video anomaly detection. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01379"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"17_CR10","doi-asserted-by":"crossref","unstructured":"Hong, F.T., Feng, J.C., Xu, D., Shan, Y., Zheng, W.S.: Cross-modal consensus network for weakly supervised temporal action localization. In: ACM MM (2021)","DOI":"10.1145\/3474085.3475298"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Hong, F.T., Huang, X., Li, W.H., Zheng, W.S.: MINI-Net: multiple instance ranking network for video highlight detection. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58601-0_21"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Huang, J., Li, Y., Feng, J., Wu, X., Sun, X., Ji, R.: Clover: towards a unified video-language alignment and fusion model. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01427"},{"key":"17_CR13","unstructured":"Jiang, Y.G., et al.: THUMOS challenge: action recognition with a large number of classes (2014). http:\/\/crcv.ucf.edu\/THUMOS14\/"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"17_CR15","unstructured":"Ju, C., et al.: Multi-modal prompting for low-shot temporal action localization. arXiv (2023)"},{"key":"17_CR16","unstructured":"Kuhn, H.W.: The Hungarian method for the assignment problem. Nav. Res. Logist. Q. (1955)"},{"key":"17_CR17","doi-asserted-by":"crossref","unstructured":"Li, D., Li, J., Li, H., Niebles, J.C., Hoi, S.C.: Align and prompt: video-and-language pre-training with entity prompts. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00490"},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Li, Y.M., Huang, W.J., Wang, A.L., Zeng, L.A., Meng, J.K., Zheng, W.S.: EgoExo-Fitness: towards egocentric and exocentric full-body action understanding. In: ECCV (2024)","DOI":"10.1007\/978-3-031-72661-3_21"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Li, Y.M., Zeng, L.A., Meng, J.K., Zheng, W.S.: Continual action assessment via task-consistent score-discriminative feature distribution modeling. TCSVT (2024)","DOI":"10.1109\/TCSVT.2024.3396692"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Lin, C., et al.: Learning salient boundary feature for anchor-free temporal action localization. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Lin, K.Q., et al.: UniVTG: towards unified video-language temporal grounding. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"17_CR22","unstructured":"Lin, K.Y., et al.: Rethinking CLIP-based video learners in cross-domain open-vocabulary action recognition. arXiv (2024)"},{"key":"17_CR23","unstructured":"Lin, K.Y., Du, J.R., Gao, Y., Zhou, J., Zheng, W.S.: Diversifying spatial-temporal perception for video domain generalization. In: NeurIPS (2024)"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Lin, K.Y., Zhou, J., Zheng, W.S.: Human-centric transformer for domain adaptive action recognition. TPAMI (2024)","DOI":"10.1109\/TPAMI.2024.3429387"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Lin, T., Liu, X., Li, X., Ding, E., Wen, S.: BMN: boundary-matching network for temporal action proposal generation. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00399"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Su, H., Wang, C., Yang, M.: BSN: boundary sensitive network for temporal action proposal generation. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: End-to-end temporal action detection with transformer. TIP (2022)","DOI":"10.1109\/CVPR52688.2022.01938"},{"key":"17_CR29","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2017)"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Luo, D., Huang, J., Gong, S., Jin, H., Liu, Y.: Towards generalisable video moment retrieval: visual-dynamic injection to image-text pre-training. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.P.: Query-dependent video representation for moment retrieval and highlight detection. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"17_CR33","doi-asserted-by":"crossref","unstructured":"Nag, S., Zhu, X., Song, Y.Z., Xiang, T.: Zero-shot temporal action detection via vision-language prompting. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20062-5_39"},{"key":"17_CR34","unstructured":"Paszke, A., et\u00a0al.: PyTorch: an imperative style, high-performance deep learning library. In: NeurIPS (2019)"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Phan, T., Vo, K., Le, D., Doretto, G., Adjeroh, D., Le, N.: ZEETAD: adapting pretrained vision-language model for zero-shot end-to-end temporal action detection. In: WACV (2024)","DOI":"10.1109\/WACV57701.2024.00689"},{"key":"17_CR36","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"17_CR37","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: DenseCLIP: language-guided dense prediction with context-aware prompting. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Shi, D., Zhong, Y., Cao, Q., Ma, L., Li, J., Tao, D.: TriDet: temporal action detection with relative boundary modeling. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01808"},{"key":"17_CR39","doi-asserted-by":"crossref","unstructured":"Shi, D., et al.: ReAct: temporal action detection with relational queries. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20080-9_7"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Sun, S., Gong, X.: Hierarchical semantic contrast for scene-aware video anomaly detection. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02188"},{"key":"17_CR41","doi-asserted-by":"crossref","unstructured":"Tan, J., Tang, J., Wang, L., Wu, G.: Relaxed transformer decoders for direct action proposal generation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01327"},{"key":"17_CR42","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"17_CR43","doi-asserted-by":"crossref","unstructured":"Wang, A.L., Lin, K.Y., Du, J.R., Meng, J., Zheng, W.S.: Event-guided procedure planning from instructional videos with text supervision. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01248"},{"key":"17_CR44","doi-asserted-by":"crossref","unstructured":"Wu, W., Luo, H., Fang, B., Wang, J., Ouyang, W.: Cap4Video: what can auxiliary captions do for text-video retrieval? In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"17_CR45","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: VideoCLIP: contrastive pre-training for zero-shot video-text understanding. arXiv (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"17_CR46","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhao, C., Rojas, D.S., Thabet, A., Ghanem, B.: G-TAD: sub-graph localization for temporal action detection. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"17_CR47","doi-asserted-by":"crossref","unstructured":"Yuan, J., Ni, B., Yang, X., Kassim, A.A.: Temporal action localization with pyramid of score distribution features. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.337"},{"key":"17_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Exploiting completeness and uncertainty of pseudo labels for weakly supervised video anomaly detection. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01561"},{"key":"17_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, C.L., Wu, J., Li, Y.: ActionFormer: localizing moments of actions with transformers. In: ECCV. Springer (2022)","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"17_CR50","unstructured":"Zhou, J., Liang, J., Lin, K.Y., Yang, J., Zheng, W.S.: ActionHub: a large-scale action video description dataset for zero-shot action recognition. arXiv (2024)"},{"key":"17_CR51","doi-asserted-by":"crossref","unstructured":"Zhou, J., Lin, K.Y., Li, H., Zheng, W.S.: Graph-based high-order relation modeling for long-term action recognition. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00887"},{"key":"17_CR52","doi-asserted-by":"crossref","unstructured":"Zhou, J., Lin, K.Y., Qiu, Y.K., Zheng, W.S.: TwinFormer: fine-to-coarse temporal modeling for long-term action recognition. TMM (2023)","DOI":"10.1109\/TMM.2023.3302471"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78444-6_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T11:35:49Z","timestamp":1733225749000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78444-6_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,4]]},"ISBN":["9783031784439","9783031784446"],"references-count":52,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78444-6_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,4]]},"assertion":[{"value":"4 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}