{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:32:32Z","timestamp":1778257952482,"version":"3.51.4"},"publisher-location":"Cham","reference-count":93,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198328","type":"print"},{"value":"9783031198335","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19833-5_7","type":"book-chapter","created":{"date-parts":[[2022,11,4]],"date-time":"2022-11-04T00:40:30Z","timestamp":1667522430000},"page":"105-124","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":303,"title":["Prompting Visual-Language Models for\u00a0Efficient Video Understanding"],"prefix":"10.1007","author":[{"given":"Chen","family":"Ju","sequence":"first","affiliation":[]},{"given":"Tengda","family":"Han","sequence":"additional","affiliation":[]},{"given":"Kunhao","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Ya","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Weidi","family":"Xie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,11,4]]},"reference":[{"key":"7_CR1","doi-asserted-by":"crossref","unstructured":"Anne Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. Proceedings of the International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"7_CR3","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: Proceedings of the International Conference on Machine Learning (2021)"},{"key":"7_CR4","unstructured":"Bishay, M., Zoumpourlis, G., Patras, I.: TARN: temporal attentive relation network for few-shot and zero-shot action recognition. In: Proceedings of the British Machine Vision Conference (2019)"},{"key":"7_CR5","doi-asserted-by":"crossref","unstructured":"Brattoli, B., Tighe, J., Zhdanov, F., Perona, P., Chalupka, K.: Rethinking zero-shot video classification: end-to-end training for realistic applications. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.00467"},{"key":"7_CR6","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems (2020)"},{"key":"7_CR7","unstructured":"Buch, S., Escorcia, V., Ghanem, B., Fei-Fei, L., Niebles, J.C.: End-to-end, single-stream temporal action detection in untrimmed videos. 
In: Proceedings of the British Machine Vision Conference (2019)"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Cao, K., Ji, J., Cao, Z., Chang, C.Y., Niebles, J.C.: Few-shot video classification via temporal alignment. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.01063"},{"key":"7_CR9","unstructured":"Carreira, J., Noland, E., Hillier, C., Zisserman, A.: A short note on the kinetics-700 human action dataset. arXiv preprint arXiv:1907.06987 (2019)"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Chao, Y.W., Vijayanarasimhan, S., Seybold, B., Ross, D.A., Deng, J., Sukthankar, R.: Rethinking the faster R-CNN architecture for temporal action localisation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00124"},{"key":"7_CR12","doi-asserted-by":"crossref","unstructured":"Croitoru, I., et al.: TeachText: crossmodal generalized distillation for text-video retrieval. In: Proceedings of the International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Dwivedi, S.K., Gupta, V., Mitra, R., Ahmed, S., Jain, A.: ProtoGAN: towards few shot learning for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/ICCVW.2019.00166"},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3D: expanding architectures for efficient video recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"7_CR15","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. In: Proceedings of the International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"7_CR17","unstructured":"Frome, A., et al.: Devise: a deep visual-semantic embedding model. In: Advances in Neural Information Processing Systems (2013)"},{"key":"7_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"214","DOI":"10.1007\/978-3-030-58548-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"V Gabeur","year":"2020","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: Multi-modal transformer for video retrieval. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12349, pp. 214\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58548-8_13"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Gan, C., Yang, T., Gongi, B.: Learning attributes equals multi-source domain generalization. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.17"},{"key":"7_CR20","doi-asserted-by":"publisher","first-page":"61","DOI":"10.1007\/s11263-016-0893-6","volume":"120","author":"C Gan","year":"2016","unstructured":"Gan, C., Yang, Y., Zhu, L., Zhao, D., Zhuang, Y.: Recognizing an action using its name: a knowledge-based approach. Int. J. Comput. Vision 120, 61\u201377 (2016)","journal-title":"Int. J. Comput. Vision"},{"key":"7_CR21","unstructured":"Gao, P., et al.: Clip-adapter: better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544 (2021)"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. In: Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"7_CR23","doi-asserted-by":"crossref","unstructured":"Gao, Z., Wang, L., Zhang, Q., Niu, Z., Zheng, N., Hua, G.: Video imprint segmentation for temporal action detection in untrimmed videos. In: Proceedings of the AAAI Conference on Artificial Intelligence (2019)","DOI":"10.1609\/aaai.v33i01.33018328"},{"key":"7_CR24","unstructured":"Ha, D., Dai, A., Le, Q.: Hypernetworks. In: Proceedings of the International Conference on Learning Representations (2016)"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Temporal alignment network for long-term video. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.00292"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Hara, K., Kataoka, H., Satoh, Y.: Can spatiotemporal 3d CNNs retrace the history of 2d CNNs and ImageNet? In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00685"},{"key":"7_CR27","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Jain, M., Van Gemert, J.C., Mensink, T., Snoek, C.G.: Objects2action: classifying and localizing actions without any video example. In: Proceedings of the International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.521"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Jain, M., Van Gemert, J.C., Snoek, C.G.: What do 15,000 object categories tell us about classifying and localizing actions? In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298599"},{"key":"7_CR30","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the International Conference on Machine Learning (2021)"},{"key":"7_CR31","unstructured":"Jia, M., et al.: Visual prompt tuning. arXiv preprint arXiv:2203.12119 (2022)"},{"key":"7_CR32","unstructured":"Jiang, Y.G., et al.: THUMOS challenge: action recognition with a large number of classes (2014). 
https:\/\/crcv.ucf.edu\/THUMOS14\/"},{"key":"7_CR33","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1162\/tacl_a_00324","volume":"8","author":"Z Jiang","year":"2020","unstructured":"Jiang, Z., Xu, F.F., Araki, J., Neubig, G.: How can we know what language models know? Trans. Assoc. Comput. Linguist. 8, 423\u2013438 (2020)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"7_CR34","doi-asserted-by":"crossref","unstructured":"Ju, C., Zhao, P., Chen, S., Zhang, Y., Wang, Y., Tian, Q.: Divide and conquer for single-frame temporal action localization. In: Proceedings of the International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.01320"},{"key":"7_CR35","doi-asserted-by":"crossref","unstructured":"Ju, C., Zhao, P., Chen, S., Zhang, Y., Zhang, X., Tian, Q.: Adaptive mutual supervision for weakly-supervised temporal action localization. arXiv preprint arXiv:2104.02357 (2021)","DOI":"10.1109\/TMM.2022.3213478"},{"key":"7_CR36","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"7_CR37","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: Proceedings of the International Conference on Computer Vision (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"7_CR38","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: ClipBERT for video-and-language learning via sparse sampling. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"7_CR39","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"7_CR40","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: optimizing continuous prompts for generation. In: Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"7_CR41","doi-asserted-by":"crossref","unstructured":"Li, Y., hung Hu, S., Li, B.: Recognizing unseen actions in a domain-adapted embedding space. In: IEEE International Conference on Image Processing (2016)","DOI":"10.1109\/ICIP.2016.7533150"},{"key":"7_CR42","doi-asserted-by":"crossref","unstructured":"Lin, C., et al.: Learning salient boundary feature for anchor-free temporal action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"7_CR43","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S.: TSM: temporal shift module for efficient video understanding. In: Proceedings of the International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00718"},{"key":"7_CR44","doi-asserted-by":"crossref","unstructured":"Lin, T., Liu, X., Li, X., Ding, E., Wen, S.: BMN: boundary-matching network for temporal action proposal generation. In: Proceedings of the International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00399"},{"key":"7_CR45","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Shou, Z.: Single shot temporal action detection. 
In: Proceedings of the ACM International Conference on Multimedia (2017)","DOI":"10.1145\/3123266.3123343"},{"key":"7_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-030-01225-0_1","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Lin","year":"2018","unstructured":"Lin, T., Zhao, X., Su, H., Wang, C., Yang, M.: BSN: boundary sensitive network for temporal action proposal generation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 3\u201321. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_1"},{"key":"7_CR47","doi-asserted-by":"crossref","unstructured":"Liu, J., Kuipers, B., Savarese, S.: Recognizing human actions by attributes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2011)","DOI":"10.1109\/CVPR.2011.5995353"},{"key":"7_CR48","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: video retrieval using representations from collaborative experts. In: Proceedings of the British Machine Vision Conference (2019)"},{"key":"7_CR49","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: Proceedings of the International Conference on Learning Representations (2019)"},{"key":"7_CR50","doi-asserted-by":"crossref","unstructured":"Luo, H., et al.: CLIP4Clip: an empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 (2021)","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"7_CR51","doi-asserted-by":"publisher","first-page":"1954","DOI":"10.1007\/s11263-021-01454-y","volume":"129","author":"P Mettes","year":"2021","unstructured":"Mettes, P., Thong, W., Snoek, C.G.M.: Object priors for classifying and localizing unseen actions. Int. J. Comput. Vision 129, 1954\u20131971 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"7_CR52","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"7_CR53","unstructured":"Miech, A., Laptev, I., Sivic, J.: Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516 (2018)"},{"key":"7_CR54","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1016\/j.neucom.2020.01.078","volume":"390","author":"A Mishra","year":"2020","unstructured":"Mishra, A., Pandey, A., Murthy, H.A.: Zero-shot learning for action recognition using synthesized features. Neurocomputing 390, 117\u2013130 (2020)","journal-title":"Neurocomputing"},{"key":"7_CR55","doi-asserted-by":"crossref","unstructured":"Monfort, M., et al.: Spoken moments: learning joint audio-visual representations from video descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.01463"},{"key":"7_CR56","unstructured":"Monfort, M., et al.: Multi-moments in time: Learning and interpreting models for multi-action video understanding. IEEE Trans. Pattern Anal. Mach. Intell., 1 (2021)"},{"key":"7_CR57","unstructured":"Mori, Y., Takahashi, H., Oka, R.: Image-to-word transformation based on dividing and vector quantizing images with words. 
In: First International Workshop on Multimedia Intelligent Storage and Retrieval Management (ACM Multimedia Conference) (1999)"},{"key":"7_CR58","unstructured":"Nawhal, M., Mori, G.: Activity graph transformer for temporal action localization. arXiv preprint arXiv:2101.08540 (2021)"},{"key":"7_CR59","doi-asserted-by":"crossref","unstructured":"Perrett, T., Masullo, A., Burghardt, T., Mirmehdi, M., Damen, D.: Temporal relational cross transformers for few-shot action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00054"},{"key":"7_CR60","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning (2021)"},{"key":"7_CR61","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1007\/s11263-016-0987-1","volume":"123","author":"A Rohrbach","year":"2017","unstructured":"Rohrbach, A., et al.: Movie description. Int. J. Comput. Vision 123, 94\u2013120 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"7_CR62","doi-asserted-by":"crossref","unstructured":"Schick, T., Sch\u00fctze, H.: Exploiting cloze questions for few shot text classification and natural language inference. In: Proceedings of the Conference of the European Chapter of the Association for Computational Linguistics (2021)","DOI":"10.18653\/v1\/2021.eacl-main.20"},{"key":"7_CR63","doi-asserted-by":"crossref","unstructured":"Shin, T., Razeghi, Y., IV, R.L.L., Wallace, E., Singh, S.: AutoPrompt: eliciting knowledge from language models with automatically generated prompts. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"7_CR64","doi-asserted-by":"crossref","unstructured":"Shou, Z., Chan, J., Zareian, A., Miyazawa, K., Chang, S.F.: CDC: convolutional-de-convolutional networks for precise temporal action localization in untrimmed videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.155"},{"key":"7_CR65","doi-asserted-by":"crossref","unstructured":"Shou, Z., Wang, D., Chang, S.F.: Temporal action localization in untrimmed videos via multi-stage CNNs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.119"},{"key":"7_CR66","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems (2014)"},{"key":"7_CR67","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"7_CR68","doi-asserted-by":"crossref","unstructured":"Su, H., Gan, W., Wu, W., Qiao, Y., Yan, J.: BSN++: complementary boundary regressor with scale-balanced relation modeling for temporal action proposal generation. In: Proceedings of the AAAI Conference on Artificial Intelligence (2021)","DOI":"10.1609\/aaai.v35i3.16363"},{"key":"7_CR69","doi-asserted-by":"crossref","unstructured":"Tan, J., Tang, J., Wang, L., Wu, G.: Relaxed transformer decoders for direct action proposal generation. 
In: Proceedings of the International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.01327"},{"key":"7_CR70","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"7_CR71","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"7_CR72","unstructured":"Wang, M., Xing, J., Liu, Y.: ActionCLIP: a new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)"},{"key":"7_CR73","doi-asserted-by":"crossref","unstructured":"Wang, Q., Zhang, Y., Zheng, Y., Pan, P.: RCL: recurrent continuous localization for temporal action detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01320"},{"key":"7_CR74","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"7_CR75","unstructured":"Weston, J., Bengio, S., Usunier, N.: WSABIE: scaling up to large vocabulary image annotation. In: Proceedings of the International Joint Conference on Artificial Intelligence (2011)"},{"key":"7_CR76","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1007\/978-3-030-01267-0_19","volume-title":"Computer Vision \u2013 ECCV 2018","author":"S Xie","year":"2018","unstructured":"Xie, S., Sun, C., Huang, J., Tu, Z., Murphy, K.: Rethinking spatiotemporal feature learning: speed-accuracy trade-offs in video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11219, pp. 318\u2013335. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01267-0_19"},{"key":"7_CR77","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-C3D: region convolutional 3d network for temporal activity detection. In: Proceedings of the International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.617"},{"key":"7_CR78","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"7_CR79","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhao, C., Rojas, D.S., Thabet, A., Ghanem, B.: G-TAD: sub-graph localization for temporal action detection. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.01017"},{"key":"7_CR80","doi-asserted-by":"publisher","first-page":"8535","DOI":"10.1109\/TIP.2020.3016486","volume":"29","author":"L Yang","year":"2020","unstructured":"Yang, L., Peng, H., Zhang, D., Fu, J., Han, J.: Revisiting anchor mechanisms for temporal action localization. IEEE Trans. Image Process. 29, 8535\u20138548 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"7_CR81","unstructured":"Yao, L., et al.: FILIP: fine-grained interactive language-image pre-training. In: Proceedings of the International Conference on Learning Representations (2022)"},{"key":"7_CR82","doi-asserted-by":"crossref","unstructured":"Yeung, S., Russakovsky, O., Mori, G., Fei-Fei, L.: End-to-end learning of action detection from frame glimpses in videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.293"},{"key":"7_CR83","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"487","DOI":"10.1007\/978-3-030-01234-2_29","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Yu","year":"2018","unstructured":"Yu, Y., Kim, J., Kim, G.: A joint sequence fusion model for video question answering and retrieval. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11211, pp. 487\u2013503. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01234-2_29"},{"key":"7_CR84","doi-asserted-by":"crossref","unstructured":"Zhang, C., Wu, J., Li, Y.: ActionFormer: localizing moments of actions with transformers. arXiv preprint arXiv:2202.07925 (2022)","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"7_CR85","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"525","DOI":"10.1007\/978-3-030-58558-7_31","volume-title":"Computer Vision \u2013 ECCV 2020","author":"H Zhang","year":"2020","unstructured":"Zhang, H., Zhang, L., Qi, X., Li, H., Torr, P.H.S., Koniusz, P.: Few-shot action recognition with permutation-invariant attention. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 525\u2013542. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_31"},{"key":"7_CR86","unstructured":"Zhang, R., et al.: Tip-adapter: training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930 (2021)"},{"key":"7_CR87","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"539","DOI":"10.1007\/978-3-030-58598-3_32","volume-title":"Computer Vision \u2013 ECCV 2020","author":"P Zhao","year":"2020","unstructured":"Zhao, P., Xie, L., Ju, C., Zhang, Y., Wang, Y., Tian, Q.: Bottom-up temporal action localization with mutual regularization. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12353, pp. 539\u2013555. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58598-3_32"},{"key":"7_CR88","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: Proceedings of the International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCV.2017.317"},{"key":"7_CR89","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. 
arXiv preprint arXiv:2109.01134 (2021)"},{"key":"7_CR90","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"7_CR91","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"782","DOI":"10.1007\/978-3-030-01234-2_46","volume-title":"Computer Vision \u2013 ECCV 2018","author":"L Zhu","year":"2018","unstructured":"Zhu, L., Yang, Y.: Compound memory networks for few-shot video classification. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11211, pp. 782\u2013797. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01234-2_46"},{"key":"7_CR92","first-page":"273","volume":"44","author":"L Zhu","year":"2020","unstructured":"Zhu, L., Yang, Y.: Label independent memory for semi-supervised few-shot video classification. IEEE Trans. Pattern Anal. Mach. Intell. 44, 273\u2013285 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"7_CR93","unstructured":"Zhu, X., Toisoul, A., Perez-Rua, J.M., Zhang, L., Martinez, B., Xiang, T.: Few-shot action recognition with prototype-centered attentive learning. In: Proceedings of the British Machine Vision Conference (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19833-5_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T15:30:47Z","timestamp":1673278247000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19833-5_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198328","9783031198335"],"references-count":93,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19833-5_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"4 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"From the workshops, 367 reviewed full papers have been selected for publication","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}