{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:48:15Z","timestamp":1777657695737,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62106022, U19B2036, 62225601"],"award-info":[{"award-number":["62106022, U19B2036, 62225601"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Youth Innovative Research Team of BUPT","award":["2023QNTD02"],"award-info":[{"award-number":["2023QNTD02"]}]},{"name":"Beijing Natural Science Foundation Project","award":["Z200002"],"award-info":[{"award-number":["Z200002"]}]},{"name":"National Key R&D Program of China","award":["2022ZD0116309"],"award-info":[{"award-number":["2022ZD0116309"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612551","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"5716-5724","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Category-Specific Prompts for Animal Action Recognition with Pretrained Vision-Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0866-2305","authenticated-orcid":false,"given":"Yinuo","family":"Jing","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9400-9107","authenticated-orcid":false,"given":"Chunyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9635-7563","authenticated-orcid":false,"given":"Ruxu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4726-093X","authenticated-orcid":false,"given":"Kongming","family":"Liang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2950-2488","authenticated-orcid":false,"given":"Zhanyu","family":"Ma","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neuron.2014.09.005"},{"key":"e_1_3_2_1_2_1","volume-title":"Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 1, 3","author":"Bahng H.","year":"2022","unstructured":"Bahng, H., Jahanian, A., Sankaranarayanan, S., and Isola, P. Exploring visual prompts for adapting large-scale models. arXiv preprint arXiv:2203.17274 1, 3 (2022), 4."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.3390\/ani11020485"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.63207"},{"key":"e_1_3_2_1_10_1","first-page":"191","volume-title":"Pattern Recognition: 43rd DAGM German Conf., DAGM GCPR 2021, Bonn, Germany, September 28--October 1, 2021, Proceedings","author":"Gowda S. N.","year":"2022","unstructured":"Gowda, S. N., Sevilla-Lara, L., Kim, K., Keller, F., and Rohrbach, M. A new split for evaluating true zero-shot action recognition. In Pattern Recognition: 43rd DAGM German Conf., DAGM GCPR 2021, Bonn, Germany, September 28--October 1, 2021, Proceedings (2022), Springer, pp. 191--205."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.47994"},{"key":"e_1_3_2_1_12_1","first-page":"8776","volume-title":"Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition","author":"Huynh D.","year":"2020","unstructured":"Huynh, D., and Elhamifar, E. A shared multi-attention framework for multilabel zero-shot learning. In Proceedings of the IEEE Conf. on Computer Vision and Pattern Recognition (2020), pp. 8776--8786."},{"key":"e_1_3_2_1_13_1","first-page":"4904","volume-title":"Int'l Conf. on Machine Learning","author":"Jia C.","year":"2021","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., and Duerig, T. Scaling up visual and vision-language representation learning with noisy text supervision. In Int'l Conf. on Machine Learning (2021), pp. 4904--4916."},{"key":"e_1_3_2_1_14_1","first-page":"709","volume-title":"Proceedings, Part XXXIII","author":"Jia M.","year":"2022","unstructured":"Jia, M., Tang, L., Chen, B.-C., Cardie, C., Belongie, S., Hariharan, B., and Lim, S.-N. Visual prompt tuning. In Computer Vision-ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXIII (2022), Springer, pp. 709--727."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00209"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00324"},{"key":"e_1_3_2_1_17_1","first-page":"105","volume-title":"Tel Aviv, Israel","author":"Ju C.","year":"2022","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., and Xie, W. Prompting visual-language models for efficient video understanding. In Computer Vision-ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXV (2022), Springer, pp. 105--124."},{"key":"e_1_3_2_1_18_1","volume-title":"Anipose: a toolkit for robust markerless 3d pose estimation. Cell reports 36, 13","author":"Karashchuk P.","year":"2021","unstructured":"Karashchuk, P., Rupp, K. L., Dickinson, E. S., Walling-Bell, S., Sanders, E., Azim, E., Brunton, B. W., and Tuthill, J. C. Anipose: a toolkit for robust markerless 3d pose estimation. Cell reports 36, 13 (2021), 109730."},{"key":"e_1_3_2_1_19_1","volume-title":"Maple: Multi-modal prompt learning. arXiv preprint arXiv:2210.03117","author":"Khattak M. U.","year":"2022","unstructured":"Khattak, M. U., Rasheed, H., Maaz, M., Khan, S., and Khan, F. S. Maple: Multi-modal prompt learning. arXiv preprint arXiv:2210.03117 (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01594-9"},{"key":"e_1_3_2_1_21_1","volume-title":"The power of scale for parameterefficient prompt tuning. arXiv preprint arXiv:2104.08691","author":"Lester B.","year":"2021","unstructured":"Lester, B., Al-Rfou, R., and Constant, N. The power of scale for parameterefficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)."},{"key":"e_1_3_2_1_22_1","first-page":"1404","volume-title":"Proceedings of the AAAI Conf. on Artificial Intelligence","volume":"36","author":"Fei M.","year":"2022","unstructured":"Li, S., Liu, H., Qian, R., Li, Y., See, J., Fei, M., Yu, X., and Lin,W. Ta2n: Two-stage action alignment network for few-shot action recognition. In Proceedings of the AAAI Conf. on Artificial Intelligence (2022), vol. 36, pp. 1404--1411."},{"key":"e_1_3_2_1_23_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190","author":"Li X. L.","year":"2021","unstructured":"Li, X. L., and Liang, P. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_24_1","first-page":"388","volume-title":"Tel Aviv, Israel","author":"Lin Z.","year":"2022","unstructured":"Lin, Z., Geng, S., Zhang, R., Gao, P., de Melo, G.,Wang, X., Dai, J., Qiao, Y., and Li, H. Frozen clip models are efficient video learners. In Computer Vision-ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXV (2022), Springer, pp. 388--404."},{"key":"e_1_3_2_1_25_1","volume-title":"P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602","author":"Liu X.","year":"2021","unstructured":"Liu, X., Ji, K., Fu, Y., Tam, W. L., Du, Z., Yang, Z., and Tang, J. P-tuning v2: Prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01844"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA52665.2021.9647417"},{"key":"e_1_3_2_1_29_1","first-page":"40","volume-title":"Animal recognition and identification with deep convolutional neural networks for automated wildlife monitoring. In 2017 IEEE Int'l Conf. on Data Science and Advanced Analytics (DSAA)","author":"Nguyen H.","year":"2017","unstructured":"Nguyen, H., Maclagan, S. J., Nguyen, T. D., Nguyen, T., Flemons, P., Andrews, K., Ritchie, E. G., and Phung, D. Animal recognition and identification with deep convolutional neural networks for automated wildlife monitoring. In 2017 IEEE Int'l Conf. on Data Science and Advanced Analytics (DSAA) (2017), pp. 40--49."},{"key":"e_1_3_2_1_30_1","first-page":"1","volume-title":"Tel Aviv, Israel","author":"Ni B.","year":"2022","unstructured":"Ni, B., Peng, H., Chen, M., Zhang, S., Meng, G., Fu, J., Xiang, S., and Ling, H. Expanding language-image pretrained models for general video recognition. In Computer Vision-ECCV 2022: 17th European Conf., Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part IV (2022), Springer, pp. 1--18."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/355324.355329"},{"key":"e_1_3_2_1_32_1","first-page":"8748","volume-title":"PMLR","author":"Radford A.","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. Learning transferable visual models from natural language supervision. In Int'l Conf. on Machine Learning (2021), PMLR, pp. 8748--8763."},{"key":"e_1_3_2_1_33_1","volume-title":"An automatic behavior recognition system classifies animal behaviors using movements and their temporal context. Journal of neuroscience methods 326","author":"Ravbar P.","year":"2019","unstructured":"Ravbar, P., Branson, K., and Simpson, J. H. An automatic behavior recognition system classifies animal behaviors using movements and their temporal context. Journal of neuroscience methods 326 (2019), 108352."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.63720"},{"key":"e_1_3_2_1_35_1","volume-title":"Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980","author":"Shin T.","year":"2020","unstructured":"Shin, T., Razeghi, Y., Logan IV, R. L., Wallace, E., and Singh, S. Autoprompt: Eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980 (2020)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093504"},{"key":"e_1_3_2_1_37_1","volume-title":"Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743","author":"Sun C.","year":"2019","unstructured":"Sun, C., Baradel, F., Murphy, K., and Schmid, C. Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743 (2019)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_39_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani A.","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., and Polosukhin, I. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.696"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41386-020-0751-7"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.251"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.58"},{"key":"e_1_3_2_1_44_1","volume-title":"Bidirectional cross-modal knowledge exploration for video recognition with pre-trained visionlanguage models. arXiv preprint arXiv:2301.00182","author":"Wu W.","year":"2022","unstructured":"Wu, W., Wang, X., Luo, H., Wang, J., Yang, Y., and Ouyang, W. Bidirectional cross-modal knowledge exploration for video recognition with pre-trained visionlanguage models. arXiv preprint arXiv:2301.00182 (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.compag.2018.11.002"},{"key":"e_1_3_2_1_46_1","volume-title":"Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225","author":"Zang Y.","year":"2022","unstructured":"Zang, Y., Li,W., Zhou, K., Huang, C., and Loy, C. C. Unified vision and language prompt learning. arXiv preprint arXiv:2210.07225 (2022)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.644"},{"key":"e_1_3_2_1_48_1","volume-title":"Factual probing is [mask]: Learning vs. learning to recall. arXiv preprint arXiv:2104.05240","author":"Zhong Z.","year":"2021","unstructured":"Zhong, Z., Friedman, D., and Chen, D. Factual probing is [mask]: Learning vs. learning to recall. arXiv preprint arXiv:2104.05240 (2021)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612551","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612551","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:56:50Z","timestamp":1755820610000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612551"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":51,"alternative-id":["10.1145\/3581783.3612551","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612551","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}