{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T18:23:34Z","timestamp":1772907814350,"version":"3.50.1"},"reference-count":117,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2024,6,13]],"date-time":"2024-06-13T00:00:00Z","timestamp":1718236800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,6,13]],"date-time":"2024-06-13T00:00:00Z","timestamp":1718236800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2024,11]]},"DOI":"10.1007\/s11263-024-02144-1","type":"journal-article","created":{"date-parts":[[2024,6,13]],"date-time":"2024-06-13T07:01:45Z","timestamp":1718262105000},"page":"5387-5409","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["OV-DAR: Open-Vocabulary Object Detection and Attributes Recognition"],"prefix":"10.1007","volume":"132","author":[{"given":"Keyan","family":"Chen","sequence":"first","affiliation":[]},{"given":"Xiaolong","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Haochen","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Cilin","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Yan","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Xu","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Yao","family":"Hu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3804-2639","authenticated-orcid":false,"given":"Weidi","family":"Xie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,6,13]]},"reference":[{"issue":"11","key":"2144_CR1","doi-asserted-by":"publisher","first-page":"1949","DOI":"10.1109\/TMM.2015.2477680","volume":"17","author":"AH Abdulnabi","year":"2015","unstructured":"Abdulnabi, A. H., Wang, G., Lu, J., & Jia, K. (2015). Multi-task cnn model for attribute prediction. IEEE Transactions on Multimedia, 17(11), 1949\u20131959.","journal-title":"IEEE Transactions on Multimedia"},{"issue":"4","key":"2144_CR2","volume":"10","author":"J Agnese","year":"2020","unstructured":"Agnese, J., Herrera, J., Tao, H., & Zhu, X. (2020). A survey and taxonomy of adversarial neural networks for text-to-image synthesis. Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery, 10(4), e1345.","journal-title":"Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery"},{"key":"2144_CR3","doi-asserted-by":"crossref","unstructured":"Akir Hossain, M. D., Sohel, F., Shiratuddin, M. F., & Laga, H. (2019). A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CsUR), 51(6), 1\u201336.","DOI":"10.1145\/3295748"},{"key":"2144_CR4","first-page":"23716","volume":"35","author":"J-B Alayrac","year":"2022","unstructured":"Alayrac, J.-B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., et al. (2022). Flamingo: A visual language model for few-shot learning. Advances in Neural Information Processing Systems, 35, 23716\u201323736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2144_CR5","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.\u00a0L., & Parikh, D. (2015). Vqa: Visual question answering. In Proceedings of the IEEE international conference on computer vision, pp. 2425\u20132433.","DOI":"10.1109\/ICCV.2015.279"},{"key":"2144_CR6","doi-asserted-by":"crossref","unstructured":"Arbel\u00e1ez, P., Pont-Tuset, J., Barron, J.\u00a0T., Marques, F., & Malik, J. (2014). Multiscale combinatorial grouping. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 328\u2013335.","DOI":"10.1109\/CVPR.2014.49"},{"key":"2144_CR7","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai, S., & An, S. (2018). A survey on automatic image caption generation. Neurocomputing, 311, 291\u2013304.","journal-title":"Neurocomputing"},{"key":"2144_CR8","unstructured":"Bangalath, H., Maaz, M., Khattak, M.\u00a0U., Khan, S.\u00a0H., & Shahbaz\u00a0Khan, F. Bridging the gap between object and image-level representations for open-vocabulary detection. Advances in Neural Information Processing Systems, 35:33781\u201333794."},{"key":"2144_CR9","doi-asserted-by":"crossref","unstructured":"Bansal, A., Sikka, K., Sharma, G., Chellappa, R., & Divakaran, A. (2018). Zero-shot object detection. In Proceedings of the European conference on computer vision (ECCV), pp. 384\u2013400.","DOI":"10.1007\/978-3-030-01246-5_24"},{"key":"2144_CR10","doi-asserted-by":"crossref","unstructured":"Bravo, M.\u00a0A., Mittal, S., & Brox, T. (2022). Localized vision-language matching for open-vocabulary object detection. In DAGM German conference on pattern recognition, pp. 393\u2013408. Springer.","DOI":"10.1007\/978-3-031-16788-1_24"},{"key":"2144_CR11","doi-asserted-by":"crossref","unstructured":"Bravo, M.\u00a0A., Mittal, S., Ging, S., & Brox, T. (2023). Open-vocabulary attribute detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 7041\u20137050.","DOI":"10.1109\/CVPR52729.2023.00680"},{"key":"2144_CR12","doi-asserted-by":"crossref","unstructured":"Cai, Z., & Vasconcelos, N. (2018). Cascade r-cnn: Delving into high quality object detection. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 6154\u20136162.","DOI":"10.1109\/CVPR.2018.00644"},{"issue":"7","key":"2144_CR13","doi-asserted-by":"publisher","first-page":"1312","DOI":"10.1109\/TPAMI.2011.231","volume":"34","author":"J Carreira","year":"2011","unstructured":"Carreira, J., & Sminchisescu, C. (2011). CPMC: Automatic object segmentation using constrained parametric min-cuts. IEEE Transactions on Pattern Analysis and Machine Intelligence, 34(7), 1312\u20131328.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2144_CR14","unstructured":"Chavan, A., Liu, Z., Gupta, D., Xing, E., & Shen, Z. (2023). One-for-all: Generalized lora for parameter-efficient fine-tuning. arXiv preprint arXiv:2306.07967."},{"key":"2144_CR15","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., & Zitnick, C. Lawrence. (2015). Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325."},{"key":"2144_CR16","doi-asserted-by":"crossref","unstructured":"Chen, H., Gallagher, A., & Girod, B. (2012). Describing clothing by semantic attributes. In Computer Vision\u2013ECCV 2012: 12th European conference on computer vision, Florence, Italy, October 7-13, 2012, Proceedings, Part III 12, pp. 609\u2013623. Springer.","DOI":"10.1007\/978-3-642-33712-3_44"},{"key":"2144_CR17","unstructured":"Chen, F., Han, M., Zhao, H., Zhang, Q., Shi, J., Xu, S., & Xu, B. (2023). X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages. arXiv preprint arXiv:2305.04160."},{"key":"2144_CR18","doi-asserted-by":"crossref","unstructured":"Chen, K., Jiang, X., Hu, Y., Tang, X., Gao, Y., Chen, J., Xie, W. (2023). Ovarnet: Towards open-vocabulary object attribute recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 23518\u201323527.","DOI":"10.1109\/CVPR52729.2023.02252"},{"key":"2144_CR19","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.\u00a0G., Kirillov, A., & Girdhar, R. (2022). Masked-attention mask transformer for universal image segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 1290\u20131299.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"2144_CR20","doi-asserted-by":"crossref","unstructured":"Cheng, B., Wei, Y., Shi, H., Feris, R., Xiong, J., & Huang, T. (2018). Revisiting rcnn: On awakening the classification power of faster rcnn. In Proceedings of the European conference on computer vision (ECCV), pp. 453\u2013468.","DOI":"10.1007\/978-3-030-01267-0_28"},{"key":"2144_CR21","doi-asserted-by":"crossref","unstructured":"Cherti, M., Beaumont, R., Wightman, R., Wortsman, M., Ilharco, G., Gordon, C., Schuhmann, C., Schmidt, L., & Jitsev, J. (2023). Reproducible scaling laws for contrastive language-image learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2818\u20132829.","DOI":"10.1109\/CVPR52729.2023.00276"},{"issue":"5","key":"2144_CR22","doi-asserted-by":"publisher","first-page":"823","DOI":"10.1080\/01431160600746456","volume":"28","author":"L Dengsheng","year":"2007","unstructured":"Dengsheng, L., & Weng, Q. (2007). A survey of image classification methods and techniques for improving classification performance. International Journal of Remote Sensing, 28(5), 823\u2013870.","journal-title":"International Journal of Remote Sensing"},{"issue":"3","key":"2144_CR23","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1038\/s42256-023-00626-4","volume":"5","author":"N Ding","year":"2023","unstructured":"Ding, N., Qin, Y., Yang, G., Wei, F., Yang, Z., Yusheng, S., Shengding, H., Chen, Y., Chan, C.-M., Chen, W., et al. (2023). Parameter-efficient fine-tuning of large-scale pre-trained language models. Nature Machine Intelligence, 5(3), 220\u2013235.","journal-title":"Nature Machine Intelligence"},{"key":"2144_CR24","doi-asserted-by":"crossref","unstructured":"Du, Y., Wei, F., Zhang, Z., Shi, M., Gao, Y., & Li, G. (2022). Learning to prompt for open-vocabulary object detection with vision-language model. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 14084\u201314093.","DOI":"10.1109\/CVPR52688.2022.01369"},{"key":"2144_CR25","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Endres, I., Hoiem, D., & Forsyth, D. (2009). Describing objects by their attributes. In 2009 IEEE conference on computer vision and pattern recognition, pp. 1778\u20131785. IEEE.","DOI":"10.1109\/CVPR.2009.5206772"},{"key":"2144_CR26","doi-asserted-by":"crossref","unstructured":"Feng, C., Zhong, Y., Jie, Z., Chu, X., Ren, H., Wei, X., Xie, W., & Ma, L. (2022) Promptdet: Towards open-vocabulary detection using uncurated images. In European Conference on Computer Vision, pp. 701\u2013717. Springer.","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"2144_CR27","unstructured":"Ferrari, V., & Zisserman, A. (2007) Learning visual attributes. In Advances in neural information processing systems, 20."},{"key":"2144_CR28","unstructured":"Gadre, S.\u00a0Y., Ilharco, G., Fang, A., Hayase, J., Smyrnis, G., Nguyen, T., Marten, R., Wortsman, M., Ghosh, D., Zhang, J., et\u00a0al. (2023). Datacomp: In search of the next generation of multimodal datasets. arXiv preprint arXiv:2304.14108."},{"key":"2144_CR29","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Gu, X., Cui, Y., & Lin, T.-Y. (2022). Scaling open-vocabulary image segmentation with image-level labels. In European conference on computer vision, pp. 540\u2013557. Springer.","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"2144_CR30","doi-asserted-by":"crossref","unstructured":"Girshick, R. (2015) Fast r-cnn. In Proceedings of the IEEE international conference on computer vision, pp. 1440\u20131448.","DOI":"10.1109\/ICCV.2015.169"},{"key":"2144_CR31","unstructured":"Gong, T., Lyu, C., Zhang, S., Wang, Y., Zheng, M., Zhao, Q., Liu, K., Zhang, W., Luo, P., & Chen, K. (2023). Multimodal-gpt: A vision and language model for dialogue with humans. arXiv preprint arXiv:2305.04790."},{"key":"2144_CR32","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., & Cui, Y. (2021). Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921."},{"key":"2144_CR33","doi-asserted-by":"crossref","unstructured":"Guo, S., Huang, W., Zhang, X., Srikhanta, P., Cui, Y., Li, Y., Adam, H., Scott, M.\u00a0R., & Belongie, S. (2019). The imaterialist fashion attribute dataset. In Proceedings of the IEEE\/CVF international conference on computer vision workshops, 0.","DOI":"10.1109\/ICCVW.2019.00377"},{"key":"2144_CR34","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1016\/j.neucom.2015.09.116","volume":"187","author":"Y Guo","year":"2016","unstructured":"Guo, Y., Liu, Y., Oerlemans, A., Lao, S., Wu, S., & Lew, M. S. (2016). Deep learning for visual understanding: A review. Neurocomputing, 187, 27\u201348.","journal-title":"Neurocomputing"},{"key":"2144_CR35","doi-asserted-by":"crossref","unstructured":"Hafiz, A. M., & Bhat, G. M. (2020). A survey on instance segmentation: state of the art. International Journal of Multimedia Information Retrieval, 9(3), 171\u2013189.","DOI":"10.1007\/s13735-020-00195-x"},{"key":"2144_CR36","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., & Girshick, R. (2022). Masked autoencoders are scalable vision learners. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16000\u201316009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"2144_CR37","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In Proceedings of the IEEE international conference on computer vision, pp. 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2144_CR38","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition, 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"6","key":"2144_CR39","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1109\/MSP.2017.2741510","volume":"34","author":"X He","year":"2017","unstructured":"He, X., & Deng, L. (2017). Deep learning for image-to-text generation: A technical overview. IEEE Signal Processing Magazine, 34(6), 109\u2013116.","journal-title":"IEEE Signal Processing Magazine"},{"key":"2144_CR40","unstructured":"Hu, E.\u00a0J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021). Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685."},{"key":"2144_CR41","doi-asserted-by":"publisher","first-page":"4013","DOI":"10.1109\/TIP.2020.2969330","volume":"29","author":"Y Huang","year":"2020","unstructured":"Huang, Y., Chen, J., Ouyang, W., Wan, W., & Xue, Y. (2020). Image captioning with end-to-end attribute detection and subsequent attributes prediction. IEEE Transactions on Image processing, 29, 4013\u20134026.","journal-title":"IEEE Transactions on Image processing"},{"key":"2144_CR42","doi-asserted-by":"crossref","unstructured":"Hudson, D.\u00a0A., & Manning, C.\u00a0D. (2019). Gqa: A new dataset for real-world visual reasoning and compositional question answering. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6700\u20136709.","DOI":"10.1109\/CVPR.2019.00686"},{"key":"2144_CR43","doi-asserted-by":"crossref","unstructured":"Huynh, D., Kuen, J., Lin, Z., Gu, J., & Elhamifar, E. (2022). Open-vocabulary instance segmentation via robust cross-modal pseudo-labeling. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 7020\u20137031 .","DOI":"10.1109\/CVPR52688.2022.00689"},{"issue":"6","key":"2144_CR44","doi-asserted-by":"publisher","first-page":"2515","DOI":"10.1016\/j.jksuci.2020.04.001","volume":"34","author":"T Iqbal","year":"2022","unstructured":"Iqbal, T., & Qureshi, S. (2022). The survey: Text generation models in deep learning. Journal of King Saud University-Computer and Information Sciences, 34(6), 2515\u20132528.","journal-title":"Journal of King Saud University-Computer and Information Sciences"},{"key":"2144_CR45","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning, pp. 4904\u20134916."},{"key":"2144_CR46","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.cviu.2017.06.005","volume":"163","author":"K Kafle","year":"2017","unstructured":"Kafle, K., & Kanan, C. (2017). Visual question answering: Datasets, algorithms, and future challenges. Computer Vision and Image Understanding, 163, 3\u201320.","journal-title":"Computer Vision and Image Understanding"},{"key":"2144_CR47","doi-asserted-by":"crossref","unstructured":"Karkkainen, K., & Joo, J. (2021) Fairface: Face attribute dataset for balanced race, gender, and age for bias measurement and mitigation. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp. 1548\u20131558.","DOI":"10.1109\/WACV48630.2021.00159"},{"key":"2144_CR48","doi-asserted-by":"crossref","unstructured":"Kirillov, A, Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A.\u00a0C., Lo, W.-Y., et\u00a0al. (2023). Segment anything. arXiv preprint arXiv:2304.02643 (2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2144_CR49","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D. A., et al. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International Journal of Computer Vision, 123, 32\u201373.","journal-title":"International Journal of Computer Vision"},{"issue":"1\u20132","key":"2144_CR50","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1002\/nav.3800020109","volume":"2","author":"HW Kuhn","year":"1955","unstructured":"Kuhn, H. W. (1955). The Hungarian method for the assignment problem. Naval Research Logistics Quarterly, 2(1\u20132), 83\u201397.","journal-title":"Naval Research Logistics Quarterly"},{"key":"2144_CR51","unstructured":"Kuo, W., Cui, Y., Gu, X., Piergiovanni, A. J., & Angelova, A. (2022). F-vlm: Open-vocabulary object detection upon frozen vision and language models. arXiv preprint arXiv:2209.15639 ."},{"key":"2144_CR52","doi-asserted-by":"crossref","unstructured":"Lai, X., Tian, Z., Chen, Y., Li, Y., Yuan, Y., Liu, S., & Jia, J. (2023). Lisa: Reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692.","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"2144_CR53","doi-asserted-by":"crossref","unstructured":"Lampert, C.\u00a0H., Nickisch, H., & Harmeling, S. (2009). Learning to detect unseen object classes by between-class attribute transfer. In 2009 IEEE conference on computer vision and pattern recognition, pp. 951\u2013958. IEEE.","DOI":"10.1109\/CVPRW.2009.5206594"},{"issue":"3","key":"2144_CR54","doi-asserted-by":"publisher","first-page":"453","DOI":"10.1109\/TPAMI.2013.140","volume":"36","author":"CH Lampert","year":"2013","unstructured":"Lampert, C. H., Nickisch, H., & Harmeling, S. (2013). Attribute-based classification for zero-shot visual object categorization. IEEE Transactions on Pattern Analysis and Machine Intelligence, 36(3), 453\u2013465.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2144_CR55","doi-asserted-by":"crossref","unstructured":"Lee, J., Bang, J., & Yang, S.-I. (2017). Object detection with sliding window in images including multiple similar objects. In 2017 international conference on information and communication technology convergence (ICTC), pp. 803\u2013806. IEEE.","DOI":"10.1109\/ICTC.2017.8190786"},{"key":"2144_CR56","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., & Constant, N. (2021). The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691.","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"2144_CR57","doi-asserted-by":"crossref","unstructured":"Li, X.\u00a0L., & Liang, P.(2021). Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190.","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"2144_CR58","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597."},{"key":"2144_CR59","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S. (2022). Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning, pp. 12888\u201312900."},{"key":"2144_CR60","unstructured":"Lialin, V., Deshpande, V., & Rumshisky, A. (2023). Scaling down to scale up: A guide to parameter-efficient fine-tuning. arXiv preprint arXiv:2303.15647."},{"key":"2144_CR61","doi-asserted-by":"crossref","unstructured":"Liang, F., Wu, B., Dai, X., Li, K., Zhao, Y., Zhang, H., Zhang, P., Vajda, P., & Marculescu, D. (2023). Open-vocabulary semantic segmentation with mask-adapted clip. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 7061\u20137070.","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"2144_CR62","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Lawrence Zitnick, C. (2014) Microsoft coco: Common objects in context. In Computer vision\u2013ECCV 2014: 13th European conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2144_CR63","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., & Hoi, S. C. H. (2021). Align before fuse: Vision and language representation learning with momentum distillation. Advances in Neural Information Processing Systems, 34, 9694\u20139705.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2144_CR64","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.\u00a0J. (2023). Visual instruction tuning. arXiv preprint arXiv:2304.08485."},{"key":"2144_CR65","doi-asserted-by":"crossref","unstructured":"Liu, Z., Luo, P., Wang, X., & Tang, X. (2015). Deep learning face attributes in the wild. In Proceedings of the IEEE international conference on computer vision, pp. 3730\u20133738.","DOI":"10.1109\/ICCV.2015.425"},{"key":"2144_CR66","doi-asserted-by":"crossref","unstructured":"Loper, E., & Bird, S. (2002). Nltk: The natural language toolkit. arXiv preprint arXiv:cs\/0205028.","DOI":"10.3115\/1118108.1118117"},{"key":"2144_CR67","unstructured":"Loria, S., et\u00a0al. (2018). textblob documentation. Release 0.15, 2(8), 269."},{"key":"2144_CR68","unstructured":"Loshchilov, I., & Hutter, F. (2016) Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983."},{"key":"2144_CR69","unstructured":"Loshchilov, I., & Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"2144_CR70","doi-asserted-by":"crossref","unstructured":"L\u00fcddecke, T., & Ecker, A. (2022). Image segmentation using text and image prompts. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 7086\u20137096.","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"2144_CR71","unstructured":"Ma, C., Jiang, Y., Wen, X., Yuan, Z., & Qi, X. (2024). Codet: Co-occurrence guided region-word alignment for open-vocabulary object detection. Advances in neural information processing systems, 36."},{"key":"2144_CR72","doi-asserted-by":"crossref","unstructured":"Metwaly, K., Kim, A., Branson, E., & Monga, V. (2022). Glidenet: Global, local and intrinsic based dense embedding network for multi-category attributes prediction. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 4835\u20134846.","DOI":"10.1109\/CVPR52688.2022.00479"},{"key":"2144_CR73","unstructured":"OpenAI. (2023). Gpt-4 technical report."},{"key":"2144_CR74","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A., et\u00a0al. (2023). Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193."},{"key":"2144_CR75","doi-asserted-by":"crossref","unstructured":"Padilla, R, Netto, S.\u00a0L., & Da\u00a0Silva, E. A. B. (2020). A survey on performance metrics for object-detection algorithms. In 2020 international conference on systems, signals and image processing (IWSSIP), pp. 237\u2013242. IEEE.","DOI":"10.1109\/IWSSIP48289.2020.9145130"},{"key":"2144_CR76","doi-asserted-by":"crossref","unstructured":"Patterson, G., & Hays, J. (2016). Coco attributes: Attributes for people, animals, and objects. In Computer Vision\u2013ECCV 2016: 14th European conference, Amsterdam, The Netherlands, 11\u201314, 2016, Proceedings, Part VI 14, pp. 85\u2013100. Springer.","DOI":"10.1007\/978-3-319-46466-4_6"},{"key":"2144_CR77","doi-asserted-by":"crossref","unstructured":"Pfeiffer, J., R\u00fcckl\u00e9, A., Poth, C., Kamath, A., Vuli\u0107, I., Ruder, S., Cho, K., & Gurevych, I. (2020). Adapterhub: A framework for adapting transformers. arXiv preprint arXiv:2007.07779.","DOI":"10.18653\/v1\/2020.emnlp-demos.7"},{"key":"2144_CR78","doi-asserted-by":"crossref","unstructured":"Pham, K., Kafle, K., Lin, Z., Ding, Z., Cohen, S., Tran, Q., & Shrivastava, A. (2022). Improving closed and open-vocabulary attribute prediction using transformers. In European conference on computer vision, pp. 201\u2013219. Springer.","DOI":"10.1007\/978-3-031-19806-9_12"},{"key":"2144_CR79","doi-asserted-by":"crossref","unstructured":"Pham, K., Kafle, K., Lin, Z., Ding, Z., Cohen, S., Tran, Q., Shrivastava, A. (2021). Learning to predict visual attributes in the wild. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 13018\u201313028.","DOI":"10.1109\/CVPR46437.2021.01282"},{"key":"2144_CR80","doi-asserted-by":"crossref","unstructured":"Plummer, B.\u00a0A., Wang, L., Cervantes, C.\u00a0M., Caicedo, J.\u00a0C., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In Proceedings of the IEEE international conference on computer vision, pp. 2641\u20132649.","DOI":"10.1109\/ICCV.2015.303"},{"key":"2144_CR81","doi-asserted-by":"crossref","unstructured":"Pont-Tuset, J., Uijlings, J., Changpinyo, S., Soricut, R., & Ferrari, V. (2020). Connecting vision and language with localized narratives. In Computer vision\u2014ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part V 16, pp. 647\u2013664. Springer.","DOI":"10.1007\/978-3-030-58558-7_38"},{"key":"2144_CR82","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.cviu.2017.05.001","volume":"163","author":"W Qi","year":"2017","unstructured":"Qi, W., Teney, D., Wang, P., Shen, C., Dick, A., & Van Den Hengel, A. (2017). Visual question answering: A survey of methods and datasets. Computer Vision and Image Understanding, 163, 21\u201340.","journal-title":"Computer Vision and Image Understanding"},{"key":"2144_CR83","unstructured":"Radford, A, Kim, J.\u00a0W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, pp. 8748\u20138763."},{"key":"2144_CR84","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021). Zero-shot text-to-image generation. In International conference on machine learning, pp. 8821\u20138831."},{"key":"2144_CR85","unstructured":"Redmon, J., & Farhadi, A. (2018). Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767."},{"key":"2144_CR86","doi-asserted-by":"crossref","unstructured":"Saini, N., Pham, K., & Shrivastava, A. (2022). Disentangling visual embeddings for attributes and objects. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 13658\u201313667.","DOI":"10.1109\/CVPR52688.2022.01329"},{"key":"2144_CR87","unstructured":"Shaoqing, H., Kaiming, G., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, 28."},{"key":"2144_CR88","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., & Soricut, R. (2018). Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In Proceedings of the 56th annual meeting of the association for computational linguistics (vol. 1: Long Papers), pp. 2556\u20132565.","DOI":"10.18653\/v1\/P18-1238"},{"key":"2144_CR89","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556."},{"key":"2144_CR90","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104117","volume":"107","author":"R Solovyev","year":"2021","unstructured":"Solovyev, R., Wang, W., & Gabruseva, T. (2021). Weighted boxes fusion: Ensembling boxes from different object detection models. Image and Vision Computing, 107, 104117.","journal-title":"Image and Vision Computing"},{"key":"2144_CR91","doi-asserted-by":"crossref","unstructured":"Sung, Y.-L., Cho, J., & Bansal, M. (2022). Vl-adapter: Parameter-efficient transfer learning for vision-and-language tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 5227\u20135237.","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"2144_CR92","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JRR Uijlings","year":"2013","unstructured":"Uijlings, J. R. R., Van De Sande, K. E. A., Gevers, T., & Smeulders, A. W. M. (2013). Selective search for object recognition. International Journal of Computer Vision, 104, 154\u2013171.","journal-title":"International Journal of Computer Vision"},{"key":"2144_CR93","unstructured":"Vu, T., Jang, H., Pham, T.\u00a0X., & Yoo, C. (2019). Cascade rpn: Delving into high-quality region proposal network with adaptive convolution. Advances in neural information processing systems, 32."},{"key":"2144_CR94","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhang, Y., Yu, X., et al. (2020). Computational intelligence and neuroscience: An overview of image caption generation methods.","DOI":"10.1155\/2020\/3062706"},{"key":"2144_CR95","first-page":"21969","volume":"33","author":"X Wenjia","year":"2020","unstructured":"Wenjia, X., Xian, Y., Wang, J., Schiele, B., & Akata, Z. (2020). Attribute prototype network for zero-shot learning. Advances in Neural Information Processing Systems, 33, 21969\u201321980.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2144_CR96","doi-asserted-by":"crossref","unstructured":"Wu, C., Lin, Z., Cohen, S., Bui, T., & Maji, S. (2020). Phrasecut: Language-based image segmentation in the wild. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10216\u201310225.","DOI":"10.1109\/CVPR42600.2020.01023"},{"key":"2144_CR97","doi-asserted-by":"crossref","unstructured":"Wu, X., Zhu, F., Zhao, R., & Li, H. (2023). Cora: Adapting clip for open-vocabulary detection with region prompting and anchor pre-matching. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 7031\u20137040.","DOI":"10.1109\/CVPR52729.2023.00679"},{"key":"2144_CR98","doi-asserted-by":"crossref","unstructured":"Xu, J., Hou, J., Zhang, Y., Feng, R., Wang, Y., Qiao, Y., & Xie, W. (2023) Learning open-vocabulary semantic segmentation models from natural language supervision. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2935\u20132944.","DOI":"10.1109\/CVPR52729.2023.00287"},{"key":"2144_CR99","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., & De\u00a0Mello, S. (2023). Open-vocabulary panoptic segmentation with text-to-image diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 2955\u20132966.","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"2144_CR100","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., & Xu, C. (2023). Visual-language prompt tuning with knowledge-guided context optimization. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 6757\u20136767.","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"2144_CR101","unstructured":"You, H., Guo, M., Wang, Z., Chang, K.-W., Baldridge, J., & Yu, J. (2023). Cobit: A contrastive bi-directional image-text generation model. arXiv preprint arXiv:2303.13455."},{"key":"2144_CR102","unstructured":"You, H., Zhang, H., Gan, Z., Du, X., Zhang, B., Wang, Z., Cao, L., Chang, S.-F., Yang, Y. (2023). Ferret: Refer and ground anything anywhere at any granularity. arXiv preprint arXiv:2310.07704."},{"key":"2144_CR103","unstructured":"Yuan, L., Chen, D., Chen, Y.-L., Codella, N., Dai, X., Gao, J., Hu, H., Huang, X., Li, B., Li, C., et\u00a0al. (2021). Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432."},{"key":"2144_CR104","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.\u00a0D., Hu, D.\u00a0H., & Chang, S.-F. (2021). Open-vocabulary object detection using captions. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 14393\u201314402.","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"2144_CR105","unstructured":"Zeng, Y., Zhang, X., & Li, H. (2021). Multi-grained vision language pre-training: Aligning texts with visual concepts. arXiv preprint arXiv:2111.08276."},{"key":"2144_CR106","doi-asserted-by":"crossref","unstructured":"Zhai, X., Wang, X., Mustafa, B., Steiner, A., Keysers, D., Kolesnikov, A., & Beyer, L. (2022). Lit: Zero-shot transfer with locked-image text tuning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 18123\u201318133.","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"2144_CR107","unstructured":"Zhang, J., Huang, J., Jin, S., & Lu, S. (2023). Vision-language models for vision tasks: A survey. arXiv preprint arXiv:2304.00685."},{"key":"2144_CR108","unstructured":"Zhang, C., Liu, L., Cui, Y., Huang, G., Lin, W., Yang, Y., & Hu, Y. (2023). A comprehensive survey on segment anything model for vision and beyond. arXiv preprint arXiv:2305.08196."},{"key":"2144_CR109","unstructured":"Zhang, C., Zheng, S., Li, C., Qiao, Y., Kang, T., Shan, X., Zhang, C., Qin, C., Rameau, F., Bae, S.-H., et\u00a0al. (2023). A survey on segment anything model (sam): Vision foundation model meets prompt engineering. arXiv preprint arXiv:2306.06211."},{"issue":"4","key":"2144_CR110","doi-asserted-by":"publisher","first-page":"1051","DOI":"10.1109\/TCSVT.2019.2902268","volume":"30","author":"S Zhang","year":"2019","unstructured":"Zhang, S., Song, Z., Cao, X., Zhang, H., & Zhou, J. (2019). Task-aware attention model for clothing attribute prediction. IEEE Transactions on Circuits and Systems for Video Technology, 30(4), 1051\u20131064.","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"2144_CR111","doi-asserted-by":"crossref","unstructured":"Zhao, S., Zhang, Z., Schulter, S., Zhao, L., Vijay\u00a0Kumar, B. G., Stathopoulos, A., Chandraker, M., & Metaxas, D.\u00a0N. (2022). Exploiting unlabeled data with vision and language models for object detection. In European conference on computer vision, pp. 159\u2013175. Springer.","DOI":"10.1007\/978-3-031-20077-9_10"},{"key":"2144_CR112","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L.\u00a0H., Zhou, L., Dai, X., Yuan, L., Li, Y., et\u00a0al. (2022). Regionclip: Region-based language-image pretraining. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 16793\u201316803.","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"2144_CR113","doi-asserted-by":"crossref","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., & Misra, I. (2022). Detecting twenty-thousand classes using image-level supervision. In European conference on computer vision, pp. 350\u2013368. Springer.","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"2144_CR114","doi-asserted-by":"crossref","unstructured":"Zhou, Z., Lei, Y., Zhang, B., Liu, L., & Liu, Y. (2023). Zegclip: Towards adapting clip for zero-shot semantic segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 11175\u201311185.","DOI":"10.1109\/CVPR52729.2023.01075"},{"key":"2144_CR115","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2023). Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592."},{"key":"2144_CR116","doi-asserted-by":"crossref","unstructured":"Zitnick, C.\u00a0L., & Doll\u00e1r, P. (2014). Edge boxes: Locating object proposals from edges. In Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 391\u2013405. Springer.","DOI":"10.1007\/978-3-319-10602-1_26"},{"key":"2144_CR117","doi-asserted-by":"crossref","unstructured":"Zou, X., Dou, Z.-Y., Yang, J., Gan, Z., Li, L., Li, C., Dai, X., Behl, H., Wang, J., Yuan, L., et\u00a0al. (2023). Generalized decoding for pixel, image, and language. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 15116\u201315127.","DOI":"10.1109\/CVPR52729.2023.01451"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02144-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02144-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02144-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T05:21:56Z","timestamp":1729920116000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02144-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,13]]},"references-count":117,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2024,11]]}},"alternative-id":["2144"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02144-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,6,13]]},"assertion":[{"value":"14 October 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 May 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 June 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}