{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:11:05Z","timestamp":1774602665808,"version":"3.50.1"},"reference-count":69,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T00:00:00Z","timestamp":1770336000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T00:00:00Z","timestamp":1770336000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62172089;No.62172090;No.62106045"],"award-info":[{"award-number":["No.62172089;No.62172090;No.62106045"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004608","name":"Natural Science Foundation of Jiangsu Province","doi-asserted-by":"publisher","award":["No.BK20191258"],"award-info":[{"award-number":["No.BK20191258"]}],"id":[{"id":"10.13039\/501100004608","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Jiangsu Provincial Key Laboratory of Computer Networking Technology"},{"name":"Jiangsu Provincial Key Laboratory of Network and Information Security","award":["No.BM2003201"],"award-info":[{"award-number":["No.BM2003201"]}]},{"DOI":"10.13039\/501100011151","name":"Key Laboratory of Computer Network and Information Integration, Ministry of Education","doi-asserted-by":"publisher","award":["No.93K-9"],"award-info":[{"award-number":["No.93K-9"]}],"id":[{"id":"10.13039\/501100011151","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Nanjing Purple Mountain Laboratories"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-026-02737-y","type":"journal-article","created":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T04:54:30Z","timestamp":1770353670000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["AutoIT: Automated Image Tagging with Random Perturbation"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7676-2843","authenticated-orcid":false,"given":"Xuelin","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Jianshu","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Dongqi","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Jiawei","family":"Ge","sequence":"additional","affiliation":[]},{"given":"Weijia","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jiuxin","family":"Cao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,6]]},"reference":[{"key":"2737_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., & Anadkat S et\u00a0al. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2737_CR2","unstructured":"Agrawal, P., Antoniak, S., Hanna, E.B., Bout, B., Chaplot, D., Chudnovsky, J., Costa, D., De\u00a0Monicault, B., Garg, S., & Gervet T et\u00a0al. (2024). Pixtral 12b. arXiv preprint arXiv:2410.07073"},{"key":"2737_CR3","doi-asserted-by":"crossref","unstructured":"Alfassy, A., Karlinsky, L., Aides, A., Shtok, J., Harary, S., Feris, R., Giryes, R., & Bronstein A.M. (2019). Laso: Label-set operations networks for multi-label few-shot learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6548\u20136557","DOI":"10.1109\/CVPR.2019.00671"},{"key":"2737_CR4","doi-asserted-by":"crossref","unstructured":"Ancha, S., Osteen, P.R., Roy, N. (2024). Deep evidential uncertainty estimation for semantic segmentation under out-of-distribution obstacles. In: 2024 IEEE International Conference on Robotics and Automation (ICRA), IEEE, pp 6943\u20136951","DOI":"10.1109\/ICRA57147.2024.10611342"},{"key":"2737_CR5","doi-asserted-by":"crossref","unstructured":"Ben-Cohen, A., Zamir, N., Ben-Baruch, E., Friedman, I., & Zelnik-Manor, L. (2021). Semantic diversity learning for zero-shot multi-label classification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 640\u2013650","DOI":"10.1109\/ICCV48922.2021.00068"},{"key":"2737_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in neural information processing systems,33, 1877\u20131901.","journal-title":"Advances in neural information processing systems"},{"key":"2737_CR7","doi-asserted-by":"crossref","unstructured":"Chen, S.F., Chen, Y.C., Yeh, C.K., & Wang, Y.C.F. (2018a). Order-free rnn with visual attention for multi-label classification. In: Thirty-Second AAAI Conference on Artificial Intelligence, pp 6714\u20136721","DOI":"10.1609\/aaai.v32i1.12230"},{"key":"2737_CR8","doi-asserted-by":"crossref","unstructured":"Chen, T., Wang, Z., Li, G., & Lin, L. (2018b). Recurrent attentional reinforcement learning for multi-label image recognition. In: Proceedings of the AAAI conference on artificial intelligence, vol\u00a032","DOI":"10.1609\/aaai.v32i1.12281"},{"key":"2737_CR9","doi-asserted-by":"crossref","unstructured":"Chen, T., Xu, M., Hui, X., Wu, H., & Lin, L. (2019a). Learning semantic-specific graph representation for multi-label image recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 522\u2013531","DOI":"10.1109\/ICCV.2019.00061"},{"key":"2737_CR10","doi-asserted-by":"publisher","first-page":"339","DOI":"10.1609\/aaai.v36i1.19910","volume":"36","author":"T Chen","year":"2022","unstructured":"Chen, T., Pu, T., Wu, H., Xie, Y., & Lin, L. (2022). Structured semantic transfer for multi-label recognition with partial labels. Proceedings of the AAAI conference on artificial intelligence,36, 339\u2013346.","journal-title":"Proceedings of the AAAI conference on artificial intelligence"},{"key":"2737_CR11","doi-asserted-by":"crossref","unstructured":"Chen, Z.M., Wei, X.S., Wang, P., & Guo, Y. (2019b). Multi-label image recognition with graph convolutional networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5177\u20135186","DOI":"10.1109\/CVPR.2019.00532"},{"key":"2737_CR12","unstructured":"Chiang, W.L., Li, Z., Lin, Z., Sheng, Y., Wu, Z., Zhang, H., Zheng, L., Zhuang, S., Zhuang, Y., Gonzalez, J.E., Stoica, I., & Xing, E.P. (2023a). Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"2737_CR13","unstructured":"Chiang, W.L., Li, Z., Lin, Z., Sheng, Y., Wu, Z., Zhang, H., Zheng, L., Zhuang, S., Zhuang, Y., & Gonzalez, J.E., et\u00a0al. (2023b). Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality. See https:\/\/vicuna lmsys org (accessed 14 April 2023)"},{"key":"2737_CR14","doi-asserted-by":"crossref","unstructured":"Chua, T.S., Tang, J., Hong, R., Li, H., Luo, Z., & Zheng, Y. (2009). Nus-wide: a real-world web image database from national university of singapore. In: Proceedings of the ACM international conference on image and video retrieval, pp 1\u20139","DOI":"10.1145\/1646396.1646452"},{"key":"2737_CR15","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, Ieee, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2737_CR16","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., & Gelly, S. et\u00a0al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"2737_CR17","doi-asserted-by":"crossref","unstructured":"Du, Z., Qian, Y., Liu, X., Ding, M., Qiu, J., Yang, Z., & Tang, J. (2021). Glm: General language model pretraining with autoregressive blank infilling. arXiv preprint arXiv:2103.10360","DOI":"10.18653\/v1\/2022.acl-long.26"},{"key":"2737_CR18","doi-asserted-by":"crossref","unstructured":"Durand, T., Mehrasa, N., & Mori, G. (2019). Learning a deep convnet for multi-label classification with partial labels. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 647\u2013657","DOI":"10.1109\/CVPR.2019.00074"},{"issue":"2","key":"2737_CR19","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C. K., Winn, J., & Zisserman, A. (2010). The pascal visual object classes (voc) challenge. International journal of computer vision,88(2), 303\u2013338.","journal-title":"International journal of computer vision"},{"key":"2737_CR20","doi-asserted-by":"crossref","unstructured":"Guo, Z., Dong, B., Ji, Z., Bai, J., Guo, Y., & Zuo, W. (2023). Texts as images in prompt tuning for multi-label image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 2808\u20132817","DOI":"10.1109\/CVPR52729.2023.00275"},{"key":"2737_CR21","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"2737_CR22","unstructured":"Huang, X., Zhang, Y., Ma, J., Tian, W., Feng, R., Zhang, Y., Li, Y., Guo, Y., & Zhang, L. (2023). Tag2text: Guiding vision-language model via image tagging. arXiv preprint arXiv:2303.05657"},{"key":"2737_CR23","doi-asserted-by":"crossref","unstructured":"Huynh, D., & Elhamifar, E. (2020). A shared multi-attention framework for multi-label zero-shot learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8776\u20138786","DOI":"10.1109\/CVPR42600.2020.00880"},{"key":"2737_CR24","unstructured":"Imam, M.F., Marew, R.F., Hassan, J., Fiaz, M., Aji, A.F., & Cholakkal, H. (2024). Clip meets dino for tuning zero-shot classifier using unlabeled image collections. arXiv preprint arXiv:2411.19346"},{"key":"2737_CR25","doi-asserted-by":"publisher","first-page":"4230","DOI":"10.1609\/aaai.v39i4.32444","volume":"39","author":"MU Khattak","year":"2025","unstructured":"Khattak, M. U., Naeem, M. F., Naseer, M., Van Gool, L., & Tombari, F. (2025). Learning to prompt with text only supervision for vision-language models. Proceedings of the AAAI Conference on Artificial Intelligence,39, 4230\u20134238.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2737_CR26","volume-title":"Learning multiple layers of features from tiny images","author":"A Krizhevsky","year":"2009","unstructured":"Krizhevsky, A., Hinton, G., et al. (2009). Learning multiple layers of features from tiny images. Rep: Department of Computer Science, University of Toronto Tech."},{"key":"2737_CR27","doi-asserted-by":"crossref","unstructured":"Lanchantin, J., Wang, T., Ordonez, V., & Qi, Y. (2021). General multi-label image classification with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16478\u201316488","DOI":"10.1109\/CVPR46437.2021.01621"},{"key":"2737_CR28","first-page":"1","volume":"1","author":"X Li","year":"2014","unstructured":"Li, X., Zhao, F., & Guo, Y. (2014). Multi-label image classification with a probabilistic label enhancement model. UAI,1, 1\u201310.","journal-title":"UAI"},{"key":"2737_CR29","unstructured":"Li, X., Dai, Y., Ge, Y., Liu, J., Shan, Y.,& Duan, L.Y. (2022). Uncertainty modeling for out-of-distribution generalization. arXiv preprint arXiv:2202.03958"},{"key":"2737_CR30","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L. (2014). Microsoft coco: Common objects in context. In: European conference on computer vision, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2737_CR31","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2023). Visual instruction tuning. arXiv preprint arXiv:2304.08485"},{"key":"2737_CR32","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2024). Visual instruction tuning. Advances in neural information processing systems 36"},{"key":"2737_CR33","unstructured":"Liu, S., Zhang, L., Yang, X., Su, H., & Zhu, J. (2021). Query2label: A simple transformer way to multi-label classification. arXiv preprint arXiv:2107.10834"},{"key":"2737_CR34","unstructured":"Loshchilov, I., & Hutter, F. (2017). Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101"},{"key":"2737_CR35","unstructured":"Van\u00a0der Maaten, L., & Hinton, G. (2008). Visualizing data using t-sne. Journal of machine learning research 9(11)"},{"key":"2737_CR36","first-page":"5765","volume":"36","author":"MJ Mirza","year":"2023","unstructured":"Mirza, M. J., Karlinsky, L., Lin, W., Possegger, H., Kozinski, M., Feris, R., & Bischof, H. (2023). Lafter: Label-free tuning of zero-shot classifier using language and unlabeled image collections. Advances in Neural Information Processing Systems,36, 5765\u20135777.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2737_CR37","doi-asserted-by":"crossref","unstructured":"Narayan, S., Gupta, A., Khan, S., Khan, F.S., Shao, L., & Shah, M. (2021). Discriminative region-based multi-label zero-shot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 8731\u20138740","DOI":"10.1109\/ICCV48922.2021.00861"},{"key":"2737_CR38","unstructured":"OpenAI. (2022). Introducing chatgpt. https:\/\/openai.com\/blog\/chatgpt"},{"key":"2737_CR39","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., Wu, J., Jiang, X., Almeida, D., Wainwright, C., Mishkin, P., Zhang, C., Agarwal, S., Slama, K., Ray, A., et al. (2022). Training language models to follow instructions with human feedback. Advances in neural information processing systems,35, 27730\u201327744.","journal-title":"Advances in neural information processing systems"},{"key":"2737_CR40","doi-asserted-by":"publisher","first-page":"2091","DOI":"10.1609\/aaai.v36i2.20105","volume":"36","author":"T Pu","year":"2022","unstructured":"Pu, T., Chen, T., Wu, H., & Lin, L. (2022). Semantic-aware representation blending for multi-label image recognition with partial labels. Proceedings of the AAAI Conference on Artificial Intelligence,36, 2091\u20132098.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2737_CR41","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J. et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In: International conference on machine learning, PMLR, pp 8748\u20138763"},{"key":"2737_CR42","doi-asserted-by":"crossref","unstructured":"Ridnik, T., Ben-Baruch, E., Zamir, N., Noy, A., Friedman, I., Protter, M., & Zelnik-Manor, L. (2021). Asymmetric loss for multi-label classification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 82\u201391","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"2737_CR43","doi-asserted-by":"crossref","unstructured":"Ridnik, T., Sharir, G., Ben-Cohen, A., Ben-Baruch, E., & Noy, A. (2023). Ml-decoder: Scalable and versatile classification head. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp 32\u201341","DOI":"10.1109\/WACV56688.2023.00012"},{"key":"2737_CR44","doi-asserted-by":"crossref","unstructured":"Roth, K., Kim, J.M., Koepke, A., Vinyals, O., Schmid, C., & Akata, Z. (2023). Waffling around for performance: Visual classification with random words and broad concepts. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 15746\u201315757","DOI":"10.1109\/ICCV51070.2023.01443"},{"key":"2737_CR45","doi-asserted-by":"crossref","unstructured":"Shojaei\u00a0Miandashti, H., Zou, Q., & Mehltretter, M. (2024). Uncertainty estimation and out-of-distribution detection for lidar scene semantic segmentation. In: European Conference on Computer Vision, Springer, pp 116\u2013131","DOI":"10.1007\/978-3-031-91767-7_8"},{"key":"2737_CR46","doi-asserted-by":"crossref","unstructured":"Simon, C., Koniusz, P., & Harandi, M. (2022). Meta-learning for multi-label few-shot classification. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 3951\u20133960","DOI":"10.1109\/WACV51458.2022.00042"},{"key":"2737_CR47","unstructured":"Smith, L.N. (2018). A disciplined approach to neural network hyper-parameters: Part 1\u2013learning rate, batch size, momentum, and weight decay. arXiv preprint arXiv:1803.09820"},{"issue":"1","key":"2737_CR48","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. (2014). Dropout: a simple way to prevent neural networks from overfitting. The journal of machine learning research,15(1), 1929\u20131958.","journal-title":"The journal of machine learning research"},{"key":"2737_CR49","first-page":"30569","volume":"35","author":"X Sun","year":"2022","unstructured":"Sun, X., Hu, P., & Saenko, K. (2022). Dualcoop: Fast adaptation to multi-label recognition with limited annotations. Advances in Neural Information Processing Systems,35, 30569\u201330582.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2737_CR50","unstructured":"Taori, R., Gulrajani, I., Zhang, T., Dubois, Y., Li, X., Guestrin, C., Liang, P., & Hashimoto, T.B. (2023). Stanford alpaca: An instruction-following llama model"},{"key":"2737_CR51","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., & Azhar F et\u00a0al. (2023). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"2737_CR52","doi-asserted-by":"crossref","unstructured":"Wang, J., Yang, Y., Mao, J., Huang, Z., Huang, C., & Xu, W. (2016). Cnn-rnn: A unified framework for multi-label image classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2285\u20132294","DOI":"10.1109\/CVPR.2016.251"},{"key":"2737_CR53","unstructured":"Wang, P., Bai, S., Tan, S., Wang, S., Fan, Z., Bai, J., Chen, K., Liu, X., Wang, J., & Ge, W., et\u00a0al. (2024). Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191"},{"key":"2737_CR54","doi-asserted-by":"crossref","unstructured":"Wang, Z., Chen, T., Li, G., Xu, R., & Lin, L. (2017). Multi-label image recognition by recurrently discovering attentional regions. In: Proceedings of the IEEE international conference on computer vision, pp 464\u2013472","DOI":"10.1109\/ICCV.2017.58"},{"issue":"9","key":"2737_CR55","doi-asserted-by":"publisher","first-page":"1901","DOI":"10.1109\/TPAMI.2015.2491929","volume":"38","author":"Y Wei","year":"2015","unstructured":"Wei, Y., Xia, W., Lin, M., Huang, J., Ni, B., Dong, J., Zhao, Y., & Yan, S. (2015). Hcp: A flexible cnn framework for multi-label image classification. IEEE transactions on pattern analysis and machine intelligence,38(9), 1901\u20131907.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2737_CR56","unstructured":"Wu, X., Jiang, Q.Y., Yang, Y., Wu, Y.F., Chen, Q.G., & Lu, J. (2024). Tai++: Text as image for multi-label image classification by co-learning transferable prompt. arXiv preprint arXiv:2405.06926"},{"key":"2737_CR57","doi-asserted-by":"crossref","unstructured":"Yamaguchi, S., Feng, D., Kanai, S., Adachi, K.,& Chijiwa, D. (2025). Post-pre-training for modality alignment in vision-language foundation models. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp 4256\u20134266","DOI":"10.1109\/CVPR52734.2025.00402"},{"key":"2737_CR58","unstructured":"Yazici, V.O., Gonzalez-Garcia, A., Ramisa, A., Twardowski, B., Weijer, J.v.d. (2020). Orderless recurrent models for multi-label classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 13440\u201313449"},{"key":"2737_CR59","doi-asserted-by":"crossref","unstructured":"Ye, J., He, J., Peng, X., Wu, W., & Qiao, Y. (2020). Attention-driven dynamic graph convolutional network for multi-label image recognition. In: European Conference on Computer Vision, Springer, pp 649\u2013665","DOI":"10.1007\/978-3-030-58589-1_39"},{"key":"2737_CR60","doi-asserted-by":"crossref","unstructured":"Yi, C., He, Y. H., Zhan, D. C., & Ye, H. J. (2024). Bridge the modality and capability gaps in vision-language model selection. Advances in Neural Information Processing Systems,37, 34429\u201334452.","DOI":"10.52202\/079017-1085"},{"key":"2737_CR61","doi-asserted-by":"publisher","first-page":"12709","DOI":"10.1609\/aaai.v34i07.6964","volume":"34","author":"R You","year":"2020","unstructured":"You, R., Guo, Z., Cui, L., Long, X., Bao, Y., & Wen, S. (2020). Cross-modality attention with semantic graph embedding for multi-label classification. Proceedings of the AAAI Conference on Artificial Intelligence,34, 12709\u201312716.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2737_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Huang, X., Ma, J., Li, Z., Luo, Z., Xie, Y., Qin, Y., Luo, T., Li, Y., & Liu, S., et\u00a0al. (2023). Recognize anything: A strong image tagging model. arXiv preprint arXiv:2306.03514","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"2737_CR63","doi-asserted-by":"crossref","unstructured":"Zhao, J., Yan, K., Zhao, Y., Guo, X., Huang, F., & Li, J. (2021). Transformer-based dual relation graph for multi-label image recognition. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 163\u2013172","DOI":"10.1109\/ICCV48922.2021.00023"},{"key":"2737_CR64","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., & Liu, Z. (2022a). Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 16816\u201316825","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"2737_CR65","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022). Learning to prompt for vision-language models. International Journal of Computer Vision,130(9), 2337\u20132348.","journal-title":"International Journal of Computer Vision"},{"key":"2737_CR66","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2023a). Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592"},{"key":"2737_CR67","doi-asserted-by":"crossref","unstructured":"Zhu, K., & Wu, J. (2021). Residual attention: A simple but effective method for multi-label recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 184\u2013193","DOI":"10.1109\/ICCV48922.2021.00025"},{"key":"2737_CR68","doi-asserted-by":"crossref","unstructured":"Zhu, X., Cao, J., Ge, J., Liu, W., & Liu, B. (2022). Two-stream transformer for multi-label image classification. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 3598\u20133607","DOI":"10.1145\/3503161.3548343"},{"key":"2737_CR69","doi-asserted-by":"crossref","unstructured":"Zhu, X., Liu, J., Liu, W., Ge, J., Liu, B., & Cao, J. (2023b). Scene-aware label graph learning for multi-label image classification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 1473\u20131482","DOI":"10.1109\/ICCV51070.2023.00142"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02737-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02737-y","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02737-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:34:31Z","timestamp":1774600471000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02737-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,6]]},"references-count":69,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2737"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02737-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,6]]},"assertion":[{"value":"4 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of Interest"}}],"article-number":"110"}}