{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T18:53:27Z","timestamp":1774292007669,"version":"3.50.1"},"reference-count":79,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,6,4]],"date-time":"2025-06-04T00:00:00Z","timestamp":1748995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,4]],"date-time":"2025-06-04T00:00:00Z","timestamp":1748995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"the National Nature Science Foundation of China","doi-asserted-by":"crossref","award":["62476029"],"award-info":[{"award-number":["62476029"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100001809","name":"the National Nature Science Foundation of China","doi-asserted-by":"crossref","award":["62225601, U23B2052"],"award-info":[{"award-number":["62225601, U23B2052"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"the Beijing Natural Science Foundation Project","award":["L242025, Z200002"],"award-info":[{"award-number":["L242025, Z200002"]}]},{"DOI":"10.13039\/501100005090","name":"Beijing Nova Program","doi-asserted-by":"publisher","award":["62476029"],"award-info":[{"award-number":["62476029"]}],"id":[{"id":"10.13039\/501100005090","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s11263-025-02408-4","type":"journal-article","created":{"date-parts":[[2025,6,4]],"date-time":"2025-06-04T08:02:10Z","timestamp":1749024130000},"page":"5062-5082","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Animal-CLIP: A Dual-Prompt Enhanced Vision-Language Model for Animal Action Recognition"],"prefix":"10.1007","volume":"133","author":[{"given":"Yinuo","family":"Jing","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4726-093X","authenticated-orcid":false,"given":"Kongming","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Ruxu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Yongxiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Zhongjiang","family":"He","sequence":"additional","affiliation":[]},{"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,4]]},"reference":[{"issue":"1","key":"2408_CR1","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1016\/j.neuron.2014.09.005","volume":"84","author":"DJ Anderson","year":"2014","unstructured":"Anderson, D. J., & Perona, P. (2014). Toward a science of computational ethology. Neuron, 84(1), 18\u201331.","journal-title":"Neuron"},{"issue":"3","key":"2408_CR2","first-page":"4","volume":"1","author":"H Bahng","year":"2022","unstructured":"Bahng, H., Jahanian, A., Sankaranarayanan, S., & Isola, P. (2022). Exploring visual prompts for adapting large-scale models, 1(3), 4. arXiv:2203.17274.","journal-title":"Exploring visual prompts for adapting large-scale models"},{"key":"2408_CR3","unstructured":"Bourdev, L. (2012) Dataset of keypoints and foreground annotations for all categories of pascal 2011"},{"key":"2408_CR4","doi-asserted-by":"crossref","unstructured":"Carreira, J., & Zisserman, A. (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 6299\u20136308).","DOI":"10.1109\/CVPR.2017.502"},{"key":"2408_CR5","unstructured":"Chatgpt, 2022."},{"key":"2408_CR6","unstructured":"Chen, F., Han, M., Zhao, H., Zhang, Q., Shi, J., Xu, S., & Xu, B. (2023) X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages. arXiv preprint arXiv:2305.04160"},{"key":"2408_CR7","doi-asserted-by":"crossref","unstructured":"Chen, J., Hu, M., Coker, D. J., Berumen, M. L., Costelloe, B., Beery, S., Rohrbach, A., & Elhoseiny, M. (2023) Mammalnet: A large-scale video benchmark for mammal recognition and behavior understanding. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, (pp. 13052\u201313061)","DOI":"10.1109\/CVPR52729.2023.01254"},{"key":"2408_CR8","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-016-0939-9","volume":"121","author":"L Del Pero","year":"2017","unstructured":"Del Pero, L., Ricco, S., Sukthankar, R., & Ferrari, V. (2017). Behavior discovery and alignment of articulated object classes from unstructured video. International Journal of Computer Vision, 121, 303\u2013325.","journal-title":"International Journal of Computer Vision"},{"key":"2408_CR9","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., (2020) X3d: Expanding architectures for efficient video recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (pp. 203\u2013213)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"2408_CR10","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., & He, K.(2019) Slowfast networks for video recognition. In Proceedings of the IEEE Int\u2019l Conference on Computer Vision (pp. 6202\u20136211)","DOI":"10.1109\/ICCV.2019.00630"},{"issue":"2","key":"2408_CR11","doi-asserted-by":"publisher","first-page":"485","DOI":"10.3390\/ani11020485","volume":"11","author":"L Feng","year":"2021","unstructured":"Feng, L., Zhao, Y., Sun, Y., Zhao, W., & Tang, J. (2021). Action recognition using a spatial-temporal network for wild felines. Animals, 11(2), 485.","journal-title":"Animals"},{"key":"2408_CR12","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.63207","volume":"10","author":"BQ Geuther","year":"2021","unstructured":"Geuther, B. Q., Peer, A., He, H., Sabnis, G., Philip, V. M., & Kumar, V. (2021). Action detection using a neural network elucidates the genetics of mouse grooming behavior. Elife, 10, Article e63207.","journal-title":"Elife"},{"key":"2408_CR13","unstructured":"Gpt4, 2023."},{"key":"2408_CR14","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.47994","volume":"8","author":"JM Graving","year":"2019","unstructured":"Graving, J. M., Chae, D., Naik, H., Li, L., Koger, B., Costelloe, B. R., & Couzin, I. D. (2019). Deepposekit, a software toolkit for fast and robust animal pose estimation using deep learning. Elife, 8, Article e47994.","journal-title":"Elife"},{"key":"2408_CR15","doi-asserted-by":"crossref","unstructured":"Gu, X., Chen, G., Wang, Y., Zhang, L., Luo, T., & Wen, L. (2023) Text with knowledge graph augmented transformer for video captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 18941\u201318951)","DOI":"10.1109\/CVPR52729.2023.01816"},{"key":"2408_CR16","unstructured":"Huang, X., Huang, Y. J., Zhang, Y., Tian, W., Feng, R., Zhang, Y., Xie, Y., Li, Y., & Zhang, L. (2023) Open-set image tagging with multi-grained text supervision. arXiv e-prints, pages arXiv\u20132310"},{"key":"2408_CR17","doi-asserted-by":"crossref","unstructured":"Hudson, D. A., & Manning, C. D. (2019) Gqa: A new dataset for real-world visual reasoning and compositional question answering. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition pp. 6700\u20136709","DOI":"10.1109\/CVPR.2019.00686"},{"key":"2408_CR18","doi-asserted-by":"crossref","unstructured":"Huynh, D., & Elhamifar, E. (2020) A shared multi-attention framework for multi-label zero-shot learning. In Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition pp. 8776\u20138786","DOI":"10.1109\/CVPR42600.2020.00880"},{"key":"2408_CR19","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B. C., Cardie, C., Belongie, S., Hariharan, B., & Lim, S. N. (2022) Visual prompt tuning. In Computer Vision\u2013ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXIII pp. 709\u2013727. Springer","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"2408_CR20","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y. T., Parekh, Z., Pham, H., Quoc Le, Sung, Y-H., Zhen Li, & Duerig, T. (2021) Scaling up visual and vision-language representation learning with noisy text supervision. In Int\u2019l Conference on Machine Learning (pp. 4904\u20134916)"},{"key":"2408_CR21","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1162\/tacl_a_00324","volume":"8","author":"Z Jiang","year":"2020","unstructured":"Jiang, Z., Xu, F. F., Araki, J., & Neubig, G. (2020). How can we know what language models know? Transactions of the Association for Computational Linguistics, 8, 423\u2013438.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2408_CR22","doi-asserted-by":"crossref","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., & Xie, W. (2022) Prompting visual-language models for efficient video understanding. In Computer Vision\u2013ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXV pp. 105\u2013124. Springer","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"2408_CR23","doi-asserted-by":"crossref","unstructured":"Kampffmeyer, M., Chen, Y., Liang, X., Wang, H., Zhang, Y., & Xing, E. P. (2019) Rethinking knowledge graph propagation for zero-shot learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 11487\u201311496)","DOI":"10.1109\/CVPR.2019.01175"},{"issue":"13","key":"2408_CR24","doi-asserted-by":"publisher","DOI":"10.1016\/j.celrep.2021.109730","volume":"36","author":"P Karashchuk","year":"2021","unstructured":"Karashchuk, P., Rupp, K. L., Dickinson, E. S., Walling-Bell, S., Sanders, E., Azim, E., Brunton, B. W., & Tuthill, J. C. (2021). Anipose: a toolkit for robust markerless 3d pose estimation. Cell Reports, 36(13), Article 109730.","journal-title":"Cell Reports"},{"key":"2408_CR25","doi-asserted-by":"crossref","unstructured":"Khattak, M. U., Rasheed, H., Maaz, M., Khan, S., & Khan, F. S. (2022) Maple: Multi-modal prompt learning. arXiv preprint arXiv:2210.03117","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"2408_CR26","unstructured":"Khosla, A., Jayadevaprakash, N., Yao, B., & Li, F. F. (2011) Novel dataset for fine-grained image categorization: Stanford dogs. In Proceedings CVPR workshop on fine-grained visual categorization (FGVC), volume\u00a02"},{"key":"2408_CR27","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Liu, Y., Wang, Z., Xu, J., Chen, G., Luo, P., et\u00a0al. (2023) Mvbench: A comprehensive multi-modal video understanding benchmark. arXiv preprint arXiv:2311.17005","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"2408_CR28","unstructured":"Li, Y., Wu, C. Y., Fan, H., Mangalam, K., Xiong, B., Malik, J., & Feichtenhofer, C. (2021) Multiscale vision transformers. In Proceedings of the IEEE Int\u2019l Conf. on Computer Vision pp. 6824\u20136835"},{"key":"2408_CR29","unstructured":"Li, B., Zhang, Y., Chen, L., Wang, J., Pu, F., Yang, J., & Liu, Z. Otter: A multi-modal model with in-context instruction tuning."},{"key":"2408_CR30","doi-asserted-by":"crossref","unstructured":"Liang, K., Wang, X., Wei, T., Chen, W., Ma, Z., & Guo, J. (2023a) Attribute learning with knowledge enhanced partial annotations. In 2023 IEEE International Conference on Image Processing (ICIP), pp. 1715\u20131719. IEEE","DOI":"10.1109\/ICIP49359.2023.10222277"},{"key":"2408_CR31","doi-asserted-by":"crossref","unstructured":"Liang, K., Wang, X., Zhang, H., Ma, Z., Guo, J. (2023b) Hierarchical visual attribute learning in the wild. In Proceedings of the 31st ACM International Conference on Multimedia, pp. 3415\u20133423","DOI":"10.1145\/3581783.3612274"},{"key":"2408_CR32","doi-asserted-by":"crossref","unstructured":"Lin, Z., Geng, S., Zhang, R., Gao, P., De Melo, G., Wang, X., Dai, J., Qiao, Y., & Li, H. (2022) Frozen clip models are efficient video learners. In Computer Vision\u2013ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXV pages 388\u2013404. Springer","DOI":"10.1007\/978-3-031-19833-5_23"},{"key":"2408_CR33","doi-asserted-by":"crossref","unstructured":"Lin, B., Ye, Y., Zhu, B., Cui, J., Ning, M., Jin, P., & Yuan, L. (2023) Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"2408_CR34","doi-asserted-by":"crossref","unstructured":"Liu, D., Hou, J., Huang, S., Liu, J., He, Y., Zheng, B., & Zhang, J. (2023) Lote-animal: A long time-span dataset for endangered animal behavior understanding. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (pp. 20064\u201320075)","DOI":"10.1109\/ICCV51070.2023.01836"},{"key":"2408_CR35","doi-asserted-by":"crossref","unstructured":"Mathis, A., Biasi, T., Schneider, S., Yuksekgonul, M., Rogers, B., Bethge, M., & Mathis, M. W. (2021) Pretraining boosts out-of-domain robustness for pose estimation. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision pp. 1859\u20131868","DOI":"10.1109\/WACV48630.2021.00190"},{"issue":"1","key":"2408_CR36","doi-asserted-by":"publisher","first-page":"8137","DOI":"10.1038\/s41598-019-44565-w","volume":"9","author":"Z Miao","year":"2019","unstructured":"Miao, Z., Gaynor, K. M., Wang, J., Liu, Z., Muellerklein, O., Norouzzadeh, M. S., McInturff, A., Bowie, R. C. K., Nathan, R., Yu, S. X., et al. (2019). Insights and approaches using deep learning to classify wildlife. Scientific Reports, 9(1), 8137.","journal-title":"Scientific Reports"},{"key":"2408_CR37","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J. B., Tapaswi, M., Laptev, I., & Sivic, J. (2019) Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In Proceedings of the IEEE Int\u2019l Conference on Computer Vision, pp. 2630\u20132640","DOI":"10.1109\/ICCV.2019.00272"},{"key":"2408_CR38","doi-asserted-by":"crossref","unstructured":"Mondal, A., Nag, S., Prada, J. M., Zhu, X., & Dutta, A. (2023) Actor-agnostic multi-label action recognition with multi-modal query. In Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) Workshops (pp. 784\u2013794)","DOI":"10.1109\/ICCVW60793.2023.00086"},{"key":"2408_CR39","doi-asserted-by":"crossref","unstructured":"Naeem, M. F., Xian, Y., Tombari, F., & Akata, Z. (2021) Learning graph embeddings for compositional zero-shot learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (pp. 953\u2013962)","DOI":"10.1109\/CVPR46437.2021.00101"},{"key":"2408_CR40","doi-asserted-by":"crossref","unstructured":"Ng, X. L., Ong, K. E., Zheng, Q., Ni, Y., Yeo, S. Y., & Liu, J. (2022) Animal kingdom: A large and diverse dataset for animal behavior understanding. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, (pp. 19023\u201319034).","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"2408_CR41","doi-asserted-by":"crossref","unstructured":"Nguyen, H., Maclagan, S. J., Nguyen, T. D., Nguyen, T., Flemons, P., Andrews, K., & Phung, D. (2017) Animal recognition and identification with deep convolutional neural networks for automated wildlife monitoring. In 2017 IEEE international conference on Data Science and Advanced Analytics (DSAA) (pp. 40\u201349).","DOI":"10.1109\/DSAA.2017.31"},{"key":"2408_CR42","doi-asserted-by":"crossref","unstructured":"Nguyen, C., Wang, D., Von Richter, K., Valencia, P., Alvarenga, F. A., & Bishop\u2013Hurley, G. (2021) Video-based cattle identification and action recognition. In 2021 Digital Image Computing: Techniques and Applications (DICTA) (pp. 01\u201305).","DOI":"10.1109\/DICTA52665.2021.9647417"},{"key":"2408_CR43","doi-asserted-by":"crossref","unstructured":"Ni, B., Peng, H., Chen, M., Zhang, S., Meng, G., Fu, J., Xiang, S., & Ling, H. (2022) Expanding language-image pretrained models for general video recognition. In Computer Vision\u2013ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part IV (pp. 1\u201318) Springer","DOI":"10.1007\/978-3-031-19772-7_1"},{"issue":"3","key":"2408_CR44","doi-asserted-by":"publisher","first-page":"417","DOI":"10.1145\/355324.355329","volume":"7","author":"J Pascoe","year":"2000","unstructured":"Pascoe, J., Ryan, N., & Morse, D. (2000). Using while moving: Hci issues in fieldwork environments. ACM Transactions on Computer-Human Interaction (TOCHI), 7(3), 417\u2013437.","journal-title":"ACM Transactions on Computer-Human Interaction (TOCHI)"},{"key":"2408_CR45","doi-asserted-by":"crossref","unstructured":"Pero, L. D., Ricco, S., Sukthankar, R., & Ferrari, V. (2015) Articulated motion discovery using pairs of trajectories. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2151\u20132160).","DOI":"10.1109\/CVPR.2015.7298827"},{"key":"2408_CR46","doi-asserted-by":"crossref","unstructured":"Pratt, S., Covert, I., Liu, R., & Farhadi, A. (2022) What does a platypus look like? generating customized prompts for zero-shot image classification","DOI":"10.1109\/ICCV51070.2023.01438"},{"key":"2408_CR47","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al. (2021) Learning transferable visual models from natural language supervision. In Int\u2019l Conference on Machine Learning (pp. 8748\u20138763)"},{"key":"2408_CR48","doi-asserted-by":"publisher","DOI":"10.1016\/j.jneumeth.2019.108352","volume":"326","author":"P Ravbar","year":"2019","unstructured":"Ravbar, P., Branson, K., & Simpson, J. H. (2019). An automatic behavior recognition system classifies animal behaviors using movements and their temporal context. Journal of neuroscience methods, 326, Article 108352.","journal-title":"Journal of neuroscience methods"},{"key":"2408_CR49","unstructured":"Romanelli, C., Cooper, D., Campbell-Lendrum, D., Maiero, M., Karesh, W. B., Hunter, D., & Golden, C. D. (2015) Connecting global priorities: biodiversity and human health: a state of knowledge review. World Health Organistion\/Secretariat of the UN Convention on Biological."},{"key":"2408_CR50","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.63720","volume":"10","author":"C Segalin","year":"2021","unstructured":"Segalin, C., Williams, J., Karigo, T., Hui, M., Zelikowsky, M., Sun, J. J., Perona, P., Anderson, D. J., & Kennedy, A. (2021). The mouse action recognition system (mars) software pipeline for automated analysis of social behaviors in mice. Elife, 10, Article e63720.","journal-title":"Elife"},{"key":"2408_CR51","doi-asserted-by":"crossref","unstructured":"Shah, S., Mishra, A., Yadati, N., & Talukdar, P. P. (2019). Kvqa: Knowledge-aware visual question answering. In Proceedings of the AAAI conference on artificial intelligence (pp. 8876-8884)","DOI":"10.1609\/aaai.v33i01.33018876"},{"key":"2408_CR52","first-page":"15558","volume":"35","author":"S Shen","year":"2022","unstructured":"Shen, S., Li, C., Xiaowei, H., Xie, Y., Yang, J., Zhang, P., Gan, Z., Lijuan Wang, L., Yuan, C. L., et al. (2022). K-lite: Learning transferable visual models with external knowledge. Advances in Neural Information Processing Systems, 35, 15558\u201315573.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2408_CR53","doi-asserted-by":"crossref","unstructured":"Shin, T., Razeghi, Y., Logan IV, R. L., Wallace, E., & Singh, S. (2020) Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"2408_CR54","doi-asserted-by":"crossref","unstructured":"Singh, A., Pietrasik, M., Natha, G., Ghouaiel, N., Brizel, K., & Ray, N. (2020) Animal detection in man-made environments. In Proceedings of the IEEE Winter Conference on Applications of Computer Vision, (pp. 1438\u20131449).","DOI":"10.1109\/WACV45572.2020.9093504"},{"key":"2408_CR55","unstructured":"Su, Y., Lan, T., Li, H., Xu, J., Wang, Y., & Cai, D. (2023) Pandagpt: One model to instruction-follow them all. arXiv preprint arXiv:2305.16355"},{"key":"2408_CR56","unstructured":"Sun, C., Baradel, F., Murphy, K., & Schmid, C. (2019a) Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743"},{"key":"2408_CR57","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., & Schmid, C. (2019b) Videobert: A joint model for video and language representation learning. In Proceedings of the IEEE Int\u2019l Conferernce on Computer Vision pp. 7464\u20137473","DOI":"10.1109\/ICCV.2019.00756"},{"key":"2408_CR58","unstructured":"Taori, R., Gulrajani, I., Zhang, T., Dubois, Y., Li, X., Guestrin, C., Liang, P., & Hashimoto, T. B. (2023) Alpaca: A strong, replicable instruction-following model. Stanford Center for Research on Foundation Models. https:\/\/crfm. stanford. edu\/2023\/03\/13\/alpaca. html, 3(6):7"},{"key":"2408_CR59","doi-asserted-by":"crossref","unstructured":"Tapanainen, P., Piitulainen, J., & Jarvinen, T. (1998) Idiomatic object usage and support verbs. In COLING 1998 Volume 2: The 17th International Conference on Computational Linguistics","DOI":"10.3115\/980432.980779"},{"key":"2408_CR60","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M. A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., et\u00a0al. (2023) Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"2408_CR61","doi-asserted-by":"crossref","unstructured":"Van Horn, G., Mac Aodha, O., Song, Y., Cui, Y., Sun, C., Shepard, A., Adam, H., Perona, P., & Belongie, S. (2018) The inaturalist species classification and detection dataset. In Proceedings of the IEEE conference on computer vision and pattern recognition pp. 8769\u20138778","DOI":"10.1109\/CVPR.2018.00914"},{"key":"2408_CR62","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017) Attention is all you need. Advances in neural information processing systems"},{"key":"2408_CR63","doi-asserted-by":"crossref","unstructured":"Veit, A., Alldrin, N., Chechik, G., Krasin, I., Gupta, A., & Belongie, S. (2017) Learning from noisy large-scale datasets with minimal supervision. In Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition pp. 839\u2013847","DOI":"10.1109\/CVPR.2017.696"},{"issue":"1","key":"2408_CR64","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1038\/s41386-020-0751-7","volume":"46","author":"L von Ziegler","year":"2021","unstructured":"von Ziegler, L., Sturman, O., & Bohacek, J. (2021). Big behavior: Challenges and opportunities in a new era of deep behavior profiling. Neuropsychopharmacology, 46(1), 33\u201344.","journal-title":"Neuropsychopharmacology"},{"key":"2408_CR65","doi-asserted-by":"crossref","unstructured":"Wang, Z., Chen, T., Li, G., Xu, R., & Lin, L. (2017) Multi-label image recognition by recurrently discovering attentional regions. In Proceedings of the IEEE Int\u2019l Conf. on Computer Vision (pp. 464\u2013472)","DOI":"10.1109\/ICCV.2017.58"},{"key":"2408_CR66","unstructured":"Wang, J., Chen, D., Luo, C., Dai, X., Yuan, L., Wu, Z., & Jiang, Y. G. (2023) Chatvideo: A tracklet-centric multimodal and versatile video understanding system. arXiv preprint arXiv:2304.14407"},{"key":"2408_CR67","doi-asserted-by":"crossref","unstructured":"Wang, J., Yang, Y., Mao, J., Huang, Z., Huang, C., & Xu, W. (2016) Cnn-rnn: A unified framework for multi-label image classification. In Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition (pp. 2285\u20132294)","DOI":"10.1109\/CVPR.2016.251"},{"key":"2408_CR68","first-page":"5696","volume":"35","author":"J Wang","year":"2022","unstructured":"Wang, J., Chen, D., Zuxuan, W., Luo, C., Zhou, L., Zhao, Y., Xie, Y., Liu, C., Jiang, Y.-G., & Yuan, L. (2022). Omnivl: One foundation model for image-language and video-language tasks. Advances in neural information processing systems, 35, 5696\u20135710.","journal-title":"Advances in neural information processing systems"},{"issue":"2","key":"2408_CR69","doi-asserted-by":"publisher","first-page":"392","DOI":"10.1007\/s11263-023-01876-w","volume":"132","author":"W Wenhao","year":"2024","unstructured":"Wenhao, W., Sun, Z., Song, Y., Wang, J., & Ouyang, W. (2024). Transferring vision-language models for visual recognition: A classifier perspective. International Journal of Computer Vision, 132(2), 392\u2013409.","journal-title":"International Journal of Computer Vision"},{"key":"2408_CR70","doi-asserted-by":"crossref","unstructured":"Wu, W., Wang, X., Luo, H., Wang, J., Yang, Y., & Ouyang, W. (2022) Bidirectional cross-modal knowledge exploration for video recognition with pre-trained vision-language models. arXiv preprint arXiv:2301.00182","DOI":"10.1109\/CVPR52729.2023.00640"},{"key":"2408_CR71","doi-asserted-by":"crossref","unstructured":"Wu, Y., Zhang, G., Gao, Y., Deng, X., Gong, K., Liang, X., & Lin, L. (2020) Bidirectional graph reasoning network for panoptic segmentation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition pp. 9080\u20139089","DOI":"10.1109\/CVPR42600.2020.00910"},{"key":"2408_CR72","doi-asserted-by":"publisher","first-page":"453","DOI":"10.1016\/j.compag.2018.11.002","volume":"155","author":"Q Yang","year":"2018","unstructured":"Yang, Q., Xiao, D., & Lin, S. (2018). Feeding behavior recognition for group-housed pigs with the faster r-cnn. Computers and Electronics in Agriculture, 155, 453\u2013460.","journal-title":"Computers and Electronics in Agriculture"},{"key":"2408_CR73","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., & Loy, C. C. 2022 Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225"},{"key":"2408_CR74","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., & Bing, L. (2023) Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"2408_CR75","doi-asserted-by":"crossref","unstructured":"Zhao, W., & Wu, X. (2023) Boosting entity-aware image captioning with multi-modal knowledge graph. IEEE Transactions on Multimedia","DOI":"10.1109\/TMM.2023.3301279"},{"key":"2408_CR76","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Friedman, D., & Chen, D. (2021) Factual probing is [mask]: Learning vs. learning to recall. arXiv preprint arXiv:2104.05240","DOI":"10.18653\/v1\/2021.naacl-main.398"},{"key":"2408_CR77","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022a) Conditional prompt learning for vision-language models. In Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition pp. 16816\u201316825","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"2408_CR78","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022b). Learning to prompt for vision-language models. Int\u2019l Journal of Computer Vision, 130(9), 2337\u20132348.","journal-title":"Int\u2019l Journal of Computer Vision"},{"key":"2408_CR79","doi-asserted-by":"crossref","unstructured":"Zhu, L., & Yang, Y. (2020) Actbert: Learning global-local video-text representations. In Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition, pp. 8746\u20138755","DOI":"10.1109\/CVPR42600.2020.00877"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02408-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02408-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02408-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T17:37:29Z","timestamp":1757180249000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02408-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,4]]},"references-count":79,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["2408"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02408-4","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,6,4]]},"assertion":[{"value":"18 September 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they have no Conflict of interest to this work.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}