{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:10:47Z","timestamp":1774602647344,"version":"3.50.1"},"reference-count":66,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,22]],"date-time":"2026-02-22T00:00:00Z","timestamp":1771718400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,22]],"date-time":"2026-02-22T00:00:00Z","timestamp":1771718400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-025-02592-3","type":"journal-article","created":{"date-parts":[[2026,2,22]],"date-time":"2026-02-22T07:56:13Z","timestamp":1771746973000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-Modal Few-Shot Object Detection with Meta-Learning-Based Cross-Modal Prompting"],"prefix":"10.1007","volume":"134","author":[{"given":"Guangxing","family":"Han","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Long","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiawei","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shiyuan","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rama","family":"Chellappa","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shih-Fu","family":"Chang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,22]]},"reference":[{"key":"2592_CR1","doi-asserted-by":"crossref","unstructured":"Bansal, A., Sikka, K., Sharma, G., Chellappa, R., & Divakaran, A. (2018). Zero-shot object detection. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 384\u2013400.","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"2592_CR2","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., Agarwal, S., Herbert-Voss, A., Krueger, G., Henighan, T., Child, R., Ramesh, A., Ziegler, D., Wu, J., Winter, C., Hesse, C., Chen, M., Sigler, E., Litwin, M., Gray, S., Chess, B., Clark, J., Berner, C., McCandlish, S., Radford, A., Sutskever, I., Amodei, D. (2020). Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M. F., & Lin, H. (eds) Advances in Neural Information Processing Systems, Curran Associates, Inc., vol 33, pp 1877\u20131901."},{"key":"2592_CR3","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., & Zagoruyko, S. (2020). End-to-end object detection with transformers. In: European Conference on Computer Vision, Springer, pp 213\u2013229.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2592_CR4","doi-asserted-by":"crossref","unstructured":"Chen, D. J., Hsieh, H. Y., & Liu, T. L. (2021). Adaptive image transformer for one-shot object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 12247\u201312256.","DOI":"10.1109\/CVPR46437.2021.01207"},{"key":"2592_CR5","unstructured":"Chen, T. I., Liu, Y. C., Su, H. T., Chang, Y. C., Lin, Y. H., Yeh, J. F., Chen, W. C., & Hsu, W. (2021). Dual-awareness attention for few-shot object detection. IEEE Transactions on Multimedia pp 1\u20131."},{"key":"2592_CR6","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2019). BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Association for Computational Linguistics, Minneapolis, Minnesota, pp 4171\u20134186, https:\/\/doi.org\/10.18653\/v1\/N19-1423, https:\/\/aclanthology.org\/N19-1423.","DOI":"10.18653\/v1\/N19-1423"},{"key":"2592_CR7","first-page":"21981","volume":"33","author":"C Doersch","year":"2020","unstructured":"Doersch, C., Gupta, A., & Zisserman, A. (2020). Crosstransformers: Spatially-aware few-shot transfer. Advances in Neural Information Processing Systems, Curran Associates Inc, 33, 21981\u201321993.","journal-title":"Advances in Neural Information Processing Systems, Curran Associates Inc"},{"issue":"2","key":"2592_CR8","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C. K., Winn, J., & Zisserman, A. (2010). The pascal visual object classes (voc) challenge. International journal of computer vision, 88(2), 303\u2013338.","journal-title":"International journal of computer vision"},{"key":"2592_CR9","doi-asserted-by":"crossref","unstructured":"Fan, Q., Zhuo, W., Tang, C. K., & Tai, Y. W. (2020). Few-shot object detection with attention-rpn and multi-relation detector. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 4013\u20134022.","DOI":"10.1109\/CVPR42600.2020.00407"},{"key":"2592_CR10","unstructured":"Finn, C., Abbeel, P., & Levine, S. (2017). Model-agnostic meta-learning for fast adaptation of deep networks. In: International Conference on Machine Learning, pp 1126\u20131135."},{"key":"2592_CR11","doi-asserted-by":"crossref","unstructured":"Gidaris, S., & Komodakis, N. (2018). Dynamic few-shot visual learning without forgetting. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4367\u20134375.","DOI":"10.1109\/CVPR.2018.00459"},{"key":"2592_CR12","doi-asserted-by":"crossref","unstructured":"Girshick, R. (2015). Fast r-cnn. In: Proceedings of the IEEE international conference on computer vision, pp 1440\u20131448.","DOI":"10.1109\/ICCV.2015.169"},{"key":"2592_CR13","unstructured":"Gu, X., Lin, T. Y., Kuo, W., & Cui, Y. (2022). Open-vocabulary object detection via vision and language knowledge distillation. In: International Conference on Learning Representations."},{"key":"2592_CR14","doi-asserted-by":"crossref","unstructured":"Han, G., & Lim, S. N. (2024). Few-shot object detection with foundation models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 28608\u201328618.","DOI":"10.1109\/CVPR52733.2024.02703"},{"key":"2592_CR15","doi-asserted-by":"crossref","unstructured":"Han, G., He, Y., Huang, S., Ma, J., & Chang, S. F. (2021). Query adaptive few-shot object detection with heterogeneous graph convolutional networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 3263\u20133272.","DOI":"10.1109\/ICCV48922.2021.00325"},{"key":"2592_CR16","doi-asserted-by":"crossref","unstructured":"Han, G., Huang, S., Ma, J., He, Y., & Chang, S. F. (2022). Meta faster r-cnn: Towards accurate few-shot object detection with attentive feature alignment. In: Thirty-Sixth AAAI Conference on Artificial Intelligence (AAAI).","DOI":"10.1609\/aaai.v36i1.19959"},{"key":"2592_CR17","doi-asserted-by":"crossref","unstructured":"Han, G., Ma, J., Huang, S., Chen, L., & Chang, S. F. (2022). Few-shot object detection with fully cross-transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR52688.2022.00525"},{"key":"2592_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2592_CR19","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision, pp 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2592_CR20","unstructured":"Hinton, G., Vinyals, O., Dean, J., et al. (2015). Distilling the knowledge in a neural network. arXiv:1503.02531."},{"key":"2592_CR21","unstructured":"Hsieh, T. I., Lo, Y. C., Chen, H. T., & Liu, T. L. (2019). One-shot object detection with co-attention and co-excitation. In: Advances in Neural Information Processing Systems, pp 2725\u20132734."},{"key":"2592_CR22","doi-asserted-by":"crossref","unstructured":"Huang, S., Ma, J., Han, G., & Chang, S. F. (2022). Task-adaptive negative class envision for few-shot open-set recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR52688.2022.00703"},{"key":"2592_CR23","doi-asserted-by":"crossref","unstructured":"Joseph, K., Khan, S., Khan, F. S., & Balasubramanian, V. N. (2021). Towards open world object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5830\u20135840.","DOI":"10.1109\/CVPR46437.2021.00577"},{"key":"2592_CR24","doi-asserted-by":"crossref","unstructured":"Kang, B., Liu, Z., Wang, X., Yu, F., Feng, J., & Darrell, T. (2019). Few-shot object detection via feature reweighting. In: Proceedings of the IEEE International Conference on Computer Vision, pp 8420\u20138429.","DOI":"10.1109\/ICCV.2019.00851"},{"key":"2592_CR25","doi-asserted-by":"crossref","unstructured":"Karlinsky, L., Shtok, J., Harary, S., Schwartz, E., Aides, A., Feris, R., Giryes, R., & Bronstein, A. M. (2019). Repmet: Representative-based metric learning for classification and few-shot object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 5197\u20135206.","DOI":"10.1109\/CVPR.2019.00534"},{"key":"2592_CR26","unstructured":"Kim, W., Son, B., & Kim, I. (2021). Vilt: Vision-and-language transformer without convolution or region supervision. In: Meila, M., Zhang, T. (eds) Proceedings of the 38th International Conference on Machine Learning, PMLR, Proceedings of Machine Learning Research, vol 139, pp 5583\u20135594, http:\/\/proceedings.mlr.press\/v139\/kim21k.html."},{"key":"2592_CR27","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., & Constant, N. (2021). The power of scale for parameter-efficient prompt tuning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, pp 3045\u20133059.","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"2592_CR28","unstructured":"Li, B., Weinberger, K. Q., Belongie, S., Koltun, V., & Ranftl, R. (2022). Language-driven semantic segmentation. In: International Conference on Learning Representations."},{"key":"2592_CR29","doi-asserted-by":"crossref","unstructured":"Li, X. L., & Liang, P. (2021). Prefix-tuning: Optimizing continuous prompts for generation. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Association for Computational Linguistics, Online, pp 4582\u20134597.","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"2592_CR30","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In: European conference on computer vision, Springer, pp 740\u2013755.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2592_CR31","doi-asserted-by":"crossref","unstructured":"Lin, T. Y., Goyal, P., Girshick, R., He, K., & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In: Proceedings of the IEEE international conference on computer vision, pp 2980\u20132988.","DOI":"10.1109\/ICCV.2017.324"},{"key":"2592_CR32","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., & Neubig, G. (2021). Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing. arXiv:2107.13586."},{"key":"2592_CR33","doi-asserted-by":"crossref","unstructured":"Liu, W., Anguelov, D., Erhan, D., Szegedy, C., Reed, S., Fu, CY., & Berg, A. C. (2016). Ssd: Single shot multibox detector. In: European conference on computer vision, Springer, pp 21\u201337.","DOI":"10.1007\/978-3-319-46448-0_2"},{"key":"2592_CR34","doi-asserted-by":"crossref","unstructured":"Ma, J., Xie, H., Han, G., Chang, S. F., Galstyan, A., & Abd-Almageed, W. (2021). Partner-assisted learning for few-shot image classification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 10573\u201310582.","DOI":"10.1109\/ICCV48922.2021.01040"},{"key":"2592_CR35","doi-asserted-by":"crossref","unstructured":"Ma, J., Zhang, X., Wu, Y., Hedau, V., & Chang, S. F. (2022). Few-shot gaze estimation with model offset predictors. In: International Conference on Acoustics, Speech and Signal Processing (ICASSP).","DOI":"10.1109\/ICASSP43922.2022.9747640"},{"key":"2592_CR36","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A. et al. (2023). Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193."},{"key":"2592_CR37","doi-asserted-by":"crossref","unstructured":"Qiao, L., Zhao, Y., Li, Z., Qiu, X., Wu, J., & Zhang, C. (2021). Defrcn: Decoupled faster r-cnn for few-shot object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 8681\u20138690.","DOI":"10.1109\/ICCV48922.2021.00856"},{"key":"2592_CR38","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, PMLR, pp 8748\u20138763."},{"key":"2592_CR39","doi-asserted-by":"crossref","unstructured":"Rao, Y., Zhao, W., Chen, G., Tang, Y., Zhu, Z., Huang, G., Zhou, J., & Lu, J. (2021). Denseclip: Language-guided dense prediction with context-aware prompting. arXiv:2112.01518.","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"2592_CR40","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., & Farhadi, A. (2016). You only look once: Unified, real-time object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 779\u2013788.","DOI":"10.1109\/CVPR.2016.91"},{"key":"2592_CR41","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems, pp 91\u201399."},{"key":"2592_CR42","doi-asserted-by":"crossref","unstructured":"Schick, T., & Sch\u00fctze, H. (2021). Exploiting cloze-questions for few-shot text classification and natural language inference. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, Association for Computational Linguistics, Online, pp 255\u2013269.","DOI":"10.18653\/v1\/2021.eacl-main.20"},{"key":"2592_CR43","doi-asserted-by":"crossref","unstructured":"Shin, T., Razeghi, Y., Logan IV, R. L., Wallace, E., & Singh, S. (2020). AutoPrompt: Eliciting Knowledge from Language Models with Automatically Generated Prompts. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), Association for Computational Linguistics, Online, pp 4222\u20134235.","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"2592_CR44","unstructured":"Snell, J., Swersky, K., & Zemel, R. (2017). Prototypical networks for few-shot learning. In: Advances in neural information processing systems, pp 4077\u20134087."},{"key":"2592_CR45","unstructured":"Su, W., Zhu, X., Cao, Y., Li, B., Lu, L., Wei, F., & Dai, J. (2020). Vl-bert: Pre-training of generic visual-linguistic representations. In: International Conference on Learning Representations, https:\/\/openreview.net\/forum?id=SygXPaEYvH."},{"key":"2592_CR46","doi-asserted-by":"crossref","unstructured":"Sun, B., Li, B., Cai, S., Yuan, Y., & Zhang, C. (2021). Fsce: Few-shot object detection via contrastive proposal encoding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 7352\u20137362.","DOI":"10.1109\/CVPR46437.2021.00727"},{"key":"2592_CR47","doi-asserted-by":"crossref","unstructured":"Sung, F., Yang, Y., Zhang, L., Xiang, T., Torr, P. H., & Hospedales, T. M. (2018). Learning to compare: Relation network for few-shot learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 1199\u20131208.","DOI":"10.1109\/CVPR.2018.00131"},{"key":"2592_CR48","doi-asserted-by":"crossref","unstructured":"Tan, H., & Bansal, M. (2019). Lxmert: Learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing.","DOI":"10.18653\/v1\/D19-1514"},{"key":"2592_CR49","unstructured":"Tsimpoukelli, M., Menick, J., Cabi, S., Eslami, S. M. A., Vinyals, O., & Hill, F. (2021). Multimodal few-shot learning with frozen language models. In: Advances in Neural Information Processing Systems."},{"key":"2592_CR50","unstructured":"Vinyals, O., Blundell, C., Lillicrap, T., Wierstra, D. et al. (2016). Matching networks for one shot learning. In: Advances in neural information processing systems, pp 3630\u20133638."},{"key":"2592_CR51","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., & He, K. (2018). Non-local neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7794\u20137803.","DOI":"10.1109\/CVPR.2018.00813"},{"key":"2592_CR52","unstructured":"Wang, X., Huang, T. E., Darrell, T., Gonzalez, J. E., & Yu, F. (2020). Frustratingly simple few-shot object detection. In: International Conference on Machine Learning (ICML)."},{"key":"2592_CR53","doi-asserted-by":"crossref","unstructured":"Wang, Y. X., Ramanan, D., & Hebert, M. (2019). Meta-learning to detect rare objects. In: Proceedings of the IEEE International Conference on Computer Vision, pp 9925\u20139934.","DOI":"10.1109\/ICCV.2019.01002"},{"key":"2592_CR54","doi-asserted-by":"crossref","unstructured":"Wu, A., Han, Y., Zhu, L., & Yang, Y. (2021). Universal-prototype enhancing for few-shot object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp 9567\u20139576.","DOI":"10.1109\/ICCV48922.2021.00943"},{"key":"2592_CR55","doi-asserted-by":"crossref","unstructured":"Wu, J., Liu, S., Huang, D., & Wang, Y. (2020). Multi-scale positive sample refinement for few-shot object detection. In: European Conference on Computer Vision, Springer, pp 456\u2013472.","DOI":"10.1007\/978-3-030-58517-4_27"},{"key":"2592_CR56","doi-asserted-by":"crossref","unstructured":"Xiao, Y., & Marlet, R. (2020). Few-shot object detection and viewpoint estimation for objects in the wild. In: European Conference on Computer Vision.","DOI":"10.1007\/978-3-030-58520-4_12"},{"key":"2592_CR57","unstructured":"Xing, C., Rostamzadeh, N., Oreshkin, B., & O Pinheiro, P. O. (2019). Adaptive cross-modal few-shot learning. Advances in Neural Information Processing Systems 32."},{"key":"2592_CR58","doi-asserted-by":"crossref","unstructured":"Yan, X., Chen, Z., Xu, A., Wang, X., Liang, X., & Lin, L. (2019). Meta r-cnn: Towards general solver for instance-level low-shot learning. In: Proceedings of the IEEE International Conference on Computer Vision, pp 9577\u20139586.","DOI":"10.1109\/ICCV.2019.00967"},{"key":"2592_CR59","unstructured":"Yao, Y., Zhang, A., Zhang, Z., Liu, Z., Chua, T. S., & Sun, M. (2021). Cpt: Colorful prompt tuning for pre-trained vision-language models. arXiv:2109.11797."},{"key":"2592_CR60","unstructured":"Ypsilantis, N. A., Garcia, N., Han, G., Ibrahimi, S., Van Noord, N., & Tolias, G. (2021). The met dataset: Instance-level recognition for artworks. In: Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)."},{"key":"2592_CR61","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K. D., Hu, D. H., & Chang, S. F. (2021). Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14393\u201314402.","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"2592_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, W., & Wang, Y. X. (2021). Hallucination improves few-shot object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 13008\u201313017.","DOI":"10.1109\/CVPR46437.2021.01281"},{"key":"2592_CR63","unstructured":"Zhang, X., Liu, Y., Wang, Y., Boularias, A. (2024). Detect everything with few examples. In: 8th Annual Conference on Robot Learning."},{"key":"2592_CR64","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2021). Learning to prompt for vision-language models. arXiv:2109.01134."},{"key":"2592_CR65","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022). Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 16816\u201316825.","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"2592_CR66","doi-asserted-by":"crossref","unstructured":"Zhu, C., Chen, F., Ahmed, U., Shen, Z., & Savvides, M. (2021). Semantic relation reasoning for shot-stable few-shot object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 8782\u20138791.","DOI":"10.1109\/CVPR46437.2021.00867"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02592-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02592-3","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02592-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:33:42Z","timestamp":1774600422000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02592-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,22]]},"references-count":66,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2592"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02592-3","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,22]]},"assertion":[{"value":"13 February 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"135"}}