{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T10:12:22Z","timestamp":1772446342760,"version":"3.50.1"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2024,12,27]],"date-time":"2024-12-27T00:00:00Z","timestamp":1735257600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,27]],"date-time":"2024-12-27T00:00:00Z","timestamp":1735257600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/100007847","name":"Natural Science Foundation of Jilin Province","doi-asserted-by":"publisher","award":["20230201082GX"],"award-info":[{"award-number":["20230201082GX"]}],"id":[{"id":"10.13039\/100007847","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s00371-024-03769-6","type":"journal-article","created":{"date-parts":[[2024,12,27]],"date-time":"2024-12-27T14:17:57Z","timestamp":1735309077000},"page":"6027-6039","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Open-vocabulary multi-label classification with visual and textual features fusion"],"prefix":"10.1007","volume":"41","author":[{"given":"Tongtong","family":"Liu","sequence":"first","affiliation":[]},{"given":"Chen","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Guoqiang","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Wenhui","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,27]]},"reference":[{"key":"3769_CR1","doi-asserted-by":"crossref","unstructured":"Li, X., Liao, S., Lan, W., Du, X., Yang, G.: Zero-shot image tagging by hierarchical semantic embedding. In: Proceedings of the 38th international ACM SIGIR conference on research and development in information retrieval, pp. 879\u2013882 (2015)","DOI":"10.1145\/2766462.2767773"},{"key":"3769_CR2","doi-asserted-by":"crossref","unstructured":"Lanchantin, J., Wang, T., Ordonez, V., Qi, Y.: General multi-label image classification with transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16478\u201316488 (2021)","DOI":"10.1109\/CVPR46437.2021.01621"},{"key":"3769_CR3","doi-asserted-by":"crossref","unstructured":"Ridnik, T., Ben-Baruch, E., Zamir, N., Noy, A., Friedman, I., Protter, M., Zelnik-Manor, L.: Asymmetric loss for multi-label classification. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 82\u201391 (2021)","DOI":"10.1109\/ICCV48922.2021.00015"},{"key":"3769_CR4","doi-asserted-by":"crossref","unstructured":"Zhou, F., Huang, S., Xing, Y.: Deep semantic dictionary learning for multi-label image classification. In: Proceedings of the AAAI conference on artificial intelligence, 35, pp. 3572\u20133580 (2021)","DOI":"10.1609\/aaai.v35i4.16472"},{"key":"3769_CR5","doi-asserted-by":"crossref","unstructured":"Zhu, X., Cao, J., Ge, J., Liu, W., Liu, B.: Two-stream transformer for multi-label image classification. In: Proceedings of the 30th ACM international conference on multimedia, pp. 3598\u20133607 (2022)","DOI":"10.1145\/3503161.3548343"},{"key":"3769_CR6","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1016\/j.neucom.2023.01.018","volume":"526","author":"T Pu","year":"2023","unstructured":"Pu, T., Sun, M., Wu, H., Chen, T., Tian, L., Lin, L.: Semantic representation and dependency learning for multi-label image recognition. Neurocomputing 526, 121\u2013130 (2023)","journal-title":"Neurocomputing"},{"key":"3769_CR7","doi-asserted-by":"crossref","unstructured":"Ben-Cohen, A., Zamir, N., Ben-Baruch, E., Friedman, I., Zelnik-Manor, L.: Semantic diversity learning for zero-shot multi-label classification. In: CVPR, pp. 640\u2013650 (2021)","DOI":"10.1109\/ICCV48922.2021.00068"},{"key":"3769_CR8","doi-asserted-by":"crossref","unstructured":"Huynh, D., Elhamifar, E.: A shared multi-attention framework for multi-label zero-shot learning. In: CVPR, pp. 8776\u20138786 (2020)","DOI":"10.1109\/CVPR42600.2020.00880"},{"key":"3769_CR9","doi-asserted-by":"publisher","first-page":"6549","DOI":"10.1109\/TIP.2020.2991527","volume":"29","author":"Z Ji","year":"2020","unstructured":"Ji, Z., Cui, B., Li, H., Jiang, Y.-G., Xiang, T., Hospedales, T., Fu, Y.: Deep ranking for image zero-shot multi-label classification. IEEE Trans. Image Process. 29, 6549\u20136560 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"3769_CR10","doi-asserted-by":"crossref","unstructured":"Narayan, S., Gupta, A., Khan, S., Khan, F.S., Shao, L., Shah, M.: Discriminative region-based multi-label zero-shot learning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 8731\u20138740 (2021)","DOI":"10.1109\/ICCV48922.2021.00861"},{"key":"3769_CR11","doi-asserted-by":"publisher","first-page":"7441","DOI":"10.1109\/TMM.2022.3222657","volume":"25","author":"Z Liu","year":"2022","unstructured":"Liu, Z., Guo, S., Guo, J., Xu, Y., Huo, F.: Towards unbiased multi-label zero-shot learning with pyramid and semantic attention. IEEE Trans. Multimedia 25, 7441\u20137455 (2022)","journal-title":"IEEE Trans. Multimedia"},{"key":"3769_CR12","doi-asserted-by":"crossref","unstructured":"Liu, Z., Guo, S., Lu, X., Guo, J., Zhang, J., Zeng, Y., Huo, F.: $$\\left( ml \\right) ^2$$ p-encoder: On exploration of channel-class correlation for multi-label zero-shot learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 23859\u201323868 (2023)","DOI":"10.1109\/CVPR52729.2023.02285"},{"key":"3769_CR13","doi-asserted-by":"crossref","unstructured":"Fu, Y., Yang, Y., Hospedales, T., Xiang, T., Gong, S.: Transductive multi-label zero-shot learning. arXiv preprint arXiv:1503.07790 (2015)","DOI":"10.1109\/TPAMI.2015.2408354"},{"key":"3769_CR14","unstructured":"Gaure, A., Gupta, A., Verma, V.K., Rai, P.: A probabilistic framework for zero-shot multi-label learning. In: The Conference on Uncertainty in Artificial Intelligence (UAI), vol. 1, p. 3 (2017)"},{"key":"3769_CR15","doi-asserted-by":"crossref","unstructured":"Gupta, A., Narayan, S., Khan, S., Khan, F.S., Shao, L., Weijer, J.: Generative multi-label zero-shot learning. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)","DOI":"10.1109\/TPAMI.2023.3295772"},{"key":"3769_CR16","doi-asserted-by":"publisher","first-page":"135","DOI":"10.1162\/tacl_a_00051","volume":"5","author":"P Bojanowski","year":"2017","unstructured":"Bojanowski, P., Grave, E., Joulin, A., Mikolov, T.: Enriching word vectors with subword information. Trans. Assoc. Comput. Linguistics 5, 135\u2013146 (2017)","journal-title":"Trans. Assoc. Comput. Linguistics"},{"key":"3769_CR17","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: Global vectors for word representation. In: EMNLP, pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"3769_CR18","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: 3rd international conference on learning representations (ICLR 2015) (2015). computational and biological learning society"},{"key":"3769_CR19","doi-asserted-by":"publisher","first-page":"2226","DOI":"10.1109\/TMM.2022.3144890","volume":"25","author":"N Jiang","year":"2022","unstructured":"Jiang, N., Sheng, B., Li, P., Lee, T.-Y.: Photohelper: portrait photographing guidance via deep feature retrieval and fusion. IEEE Trans. Multimedia 25, 2226\u20132238 (2022)","journal-title":"IEEE Trans. Multimedia"},{"key":"3769_CR20","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"key":"3769_CR21","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916 (2021). PMLR"},{"key":"3769_CR22","doi-asserted-by":"crossref","unstructured":"Mu, N., Kirillov, A., Wagner, D., Xie, S.: Slip: Self-supervision meets language-image pre-training. In: ECCV, pp. 529\u2013544 (2022). Springer","DOI":"10.1007\/978-3-031-19809-0_30"},{"key":"3769_CR23","doi-asserted-by":"crossref","unstructured":"He, S., Guo, T., Dai, T., Qiao, R., Shu, X., Ren, B., Xia, S.-T.: Open-vocabulary multi-label classification via multi-modal knowledge transfer. In: Proceedings of the AAAI conference on artificial intelligence, 37, pp. 808\u2013816 (2023)","DOI":"10.1609\/aaai.v37i1.25159"},{"key":"3769_CR24","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: international conference on learning representations (2020)"},{"key":"3769_CR25","doi-asserted-by":"crossref","unstructured":"Dao, S.D., Huynh, D., Zhao, H., Phung, D., Cai, J.: Open-vocabulary multi-label image classification with pretrained vision-language model. In: 2023 IEEE International Conference on Multimedia and Expo (ICME), pp. 2135\u20132140 (2023). IEEE","DOI":"10.1109\/ICME55011.2023.00365"},{"key":"3769_CR26","doi-asserted-by":"crossref","unstructured":"Mensink, T., Gavves, E., Snoek, C.G.: Costa: Co-occurrence statistics for zero-shot classification. In: CVPR, pp. 2441\u20132448 (2014)","DOI":"10.1109\/CVPR.2014.313"},{"key":"3769_CR27","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Gong, B., Shah, M.: Fast zero-shot image tagging. In: CVPR, pp. 5985\u20135994 (2016). IEEE","DOI":"10.1109\/CVPR.2016.644"},{"key":"3769_CR28","unstructured":"Kenton, J.D.M.-W.C., Toutanova, L.K.: Bert: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of naacL-HLT, 1, p. 2 (2019)"},{"key":"3769_CR29","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: Uniter: Learning universal image-text representations (2019)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"3769_CR30","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: Simvlm: Simple visual language model pretraining with weak supervision. In: International Conference on Learning Representations (2021)"},{"key":"3769_CR31","doi-asserted-by":"crossref","unstructured":"Wang, Z., Lu, Y., Li, Q., Tao, X., Guo, Y., Gong, M., Liu, T.: Cris: Clip-driven referring image segmentation. In: CVPR, pp. 11686\u201311695 (2022)","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"3769_CR32","doi-asserted-by":"crossref","unstructured":"Tan, H., Bansal, M.: Lxmert: Learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP), pp. 5100\u20135111 (2019)","DOI":"10.18653\/v1\/D19-1514"},{"key":"3769_CR33","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32 (2019)"},{"key":"3769_CR34","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Wu, Z., Shechtman, E., Cohen-Or, D., Lischinski, D.: Styleclip: Text-driven manipulation of stylegan imagery. In: CVPR, pp. 2085\u20132094 (2021)","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"3769_CR35","doi-asserted-by":"crossref","unstructured":"Tang, M., Wang, Z., Liu, Z., Rao, F., Li, D., Li, X.: Clip4caption: Clip for video caption. In: Proceedings of the 29th ACM international conference on multimedia, pp. 4858\u20134862 (2021)","DOI":"10.1145\/3474085.3479207"},{"key":"3769_CR36","doi-asserted-by":"crossref","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding. In: European Conference on Computer Vision, pp. 105\u2013124 (2022). Springer","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"3769_CR37","doi-asserted-by":"crossref","unstructured":"Wang, N., Xie, J., Luo, H., Cheng, Q., Wu, J., Jia, M., Li, L.: Efficient image captioning for edge devices. Proceedings of the AAAI Conference on Artificial Intelligence 37, 2608\u20132616 (2023)","DOI":"10.1609\/aaai.v37i2.25359"},{"key":"3769_CR38","doi-asserted-by":"crossref","unstructured":"Ridnik, T., Sharir, G., Ben-Cohen, A., Ben-Baruch, E., Noy, A.: Ml-decoder: Scalable and versatile classification head. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 32\u201341 (2023)","DOI":"10.1109\/WACV56688.2023.00012"},{"key":"3769_CR39","doi-asserted-by":"crossref","unstructured":"Ali, M., Khan, S.: Clip-decoder: Zeroshot multilabel classification using multimodal clip aligned representations. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 4675\u20134679 (2023)","DOI":"10.1109\/ICCVW60793.2023.00505"},{"key":"3769_CR40","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in neural information processing systems 30 (2017)"},{"key":"3769_CR41","doi-asserted-by":"crossref","unstructured":"Chua, T.-S., Tang, J., Hong, R., Li, H., Luo, Z., Zheng, Y.: Nus-wide: a real-world web image database from national university of singapore. In: Proceedings of the ACM international conference on image and video retrieval, pp. 1\u20139 (2009)","DOI":"10.1145\/1646396.1646452"},{"issue":"7","key":"3769_CR42","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., Rom, H., Alldrin, N., Uijlings, J., Krasin, I., Pont-Tuset, J., Kamali, S., Popov, S., Malloci, M., Kolesnikov, A., et al.: The open images dataset v4. Int. J. Comput. Vision 128(7), 1956\u20131981 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"3769_CR43","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"3769_CR44","unstructured":"Norouzi, M., Mikolov, T., Bengio, S., Singer, Y., Shlens, J., Frome, A., Corrado, G.S., Dean, J.: Zero-shot learning by convex combination of semantic embeddings. In: 2nd international conference on learning representations, ICLR 2014 (2014)"},{"issue":"7","key":"3769_CR45","doi-asserted-by":"publisher","first-page":"1425","DOI":"10.1109\/TPAMI.2015.2487986","volume":"38","author":"Z Akata","year":"2015","unstructured":"Akata, Z., Perronnin, F., Harchaoui, Z., Schmid, C.: Label-embedding for image classification. IEEE Trans. Pattern Anal. Mach. Intell. 38(7), 1425\u20131438 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3769_CR46","unstructured":"Kim, J.-H., Jun, J., Zhang, B.-T.: Bilinear attention networks. Advances in neural information processing systems 31 (2018)"},{"key":"3769_CR47","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03769-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03769-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03769-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,16]],"date-time":"2025-05-16T19:03:16Z","timestamp":1747422196000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03769-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,27]]},"references-count":47,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["3769"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03769-6","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,27]]},"assertion":[{"value":"13 December 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 December 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they have no Conflict of interest relevant to the content of this work. We declare that we have no known competing financial interests or personal relationships in connection with the work submitted.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}